Performance and Effects of Residual Connections, and Residual Connections in the Transformer
Let's talk about residual connections.
The kernel sweeps the image sequentially, from the top-left corner to the far right.
After the convolution has been computed for every channel, the per-channel results are summed to produce a feature map with a single channel.
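To make the channel-wise summation concrete, here is a minimal NumPy sketch (illustrative only, not from the original): a single filter spanning all three input channels yields one single-channel feature map.
import numpy as np

x = np.random.rand(3, 5, 5)       # input: 3 channels, 5x5
kernel = np.random.rand(3, 3, 3)  # one filter: 3 channels, 3x3

out = np.zeros((3, 3))            # 'valid' output: (5-3+1) x (5-3+1), one channel
for i in range(3):
    for j in range(3):
        # elementwise product across all channels, summed into a single scalar
        out[i, j] = np.sum(x[:, i:i+3, j:j+3] * kernel)
print(out.shape)  # (3, 3)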
from tensorflow.keras import datasets
from tensorflow.keras.preprocessing.sequence import pad_sequences
vocab_size = 10000
(X_train, y_train), (X_test, y_test) = datasets.imdb.load_data(num_words = vocab_size)
print(X_train[:5])
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz 17465344/17464789 [==============================] - 0s 0us/step [list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]) list([1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463, 4369, 5012, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 8163, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 6853, 5, 163, 11, 3215, 2, 4, 1153, 9, 194, 775, 7, 8255, 2, 349, 2637, 148, 605, 2, 8003, 15, 123, 125, 68, 2, 6853, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 2, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 8255, 5, 2, 656, 245, 2350, 5, 4, 9837, 131, 152, 491, 18, 2, 32, 7464, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95]) list([1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5974, 54, 61, 369, 13, 71, 149, 14, 22, 112, 4, 2401, 311, 12, 16, 3711, 33, 75, 43, 1829, 296, 4, 86, 320, 35, 534, 19, 263, 4821, 1301, 4, 1873, 33, 89, 78, 12, 66, 16, 4, 360, 7, 4, 58, 316, 334, 11, 4, 1716, 43, 645, 662, 8, 257, 85, 1200, 42, 1228, 2578, 83, 68, 3912, 15, 36, 165, 1539, 278, 36, 69, 2, 780, 8, 106, 14, 6905, 1338, 18, 6, 22, 12, 215, 28, 610, 40, 6, 87, 326, 23, 2300, 21, 23, 22, 12, 272, 40, 57, 31, 11, 4, 22, 47, 6, 2307, 51, 9, 170, 23, 595, 116, 595, 1352, 13, 191, 79, 638, 89, 2, 14, 9, 8, 106, 607, 624, 35, 534, 6, 227, 7, 129, 113]) list([1, 4, 2, 2, 33, 2804, 4, 2040, 432, 111, 153, 103, 4, 1494, 13, 70, 131, 67, 11, 61, 2, 744, 35, 3715, 761, 61, 5766, 452, 9214, 4, 985, 7, 2, 59, 166, 4, 105, 216, 1239, 41, 1797, 9, 15, 7, 35, 744, 2413, 31, 8, 4, 687, 23, 4, 2, 7339, 6, 3693, 42, 38, 39, 121, 59, 456, 10, 10, 7, 265, 12, 575, 111, 153, 159, 59, 16, 1447, 21, 25, 586, 482, 39, 4, 96, 59, 716, 12, 4, 172, 65, 9, 579, 11, 6004, 4, 1615, 5, 2, 7, 5168, 17, 13, 7064, 12, 19, 6, 464, 31, 314, 11, 2, 6, 719, 605, 11, 8, 202, 27, 310, 4, 3772, 3501, 8, 2722, 58, 10, 10, 537, 2116, 180, 40, 14, 413, 173, 7, 263, 112, 37, 152, 377, 4, 537, 263, 846, 579, 178, 54, 75, 71, 476, 36, 413, 263, 2504, 182, 5, 17, 75, 2306, 922, 36, 279, 131, 2895, 17, 2867, 42, 17, 35, 921, 2, 192, 5, 1219, 3890, 19, 2, 217, 4122, 1710, 537, 2, 1236, 5, 736, 10, 10, 61, 403, 9, 2, 40, 61, 4494, 5, 27, 4494, 159, 90, 263, 2311, 4319, 309, 8, 178, 5, 82, 4319, 4, 65, 15, 9225, 145, 143, 5122, 12, 7039, 537, 746, 537, 537, 15, 7979, 4, 2, 
594, 7, 5168, 94, 9096, 3987, 2, 11, 2, 4, 538, 7, 1795, 246, 2, 9, 2, 11, 635, 14, 9, 51, 408, 12, 94, 318, 1382, 12, 47, 6, 2683, 936, 5, 6307, 2, 19, 49, 7, 4, 1885, 2, 1118, 25, 80, 126, 842, 10, 10, 2, 2, 4726, 27, 4494, 11, 1550, 3633, 159, 27, 341, 29, 2733, 19, 4185, 173, 7, 90, 2, 8, 30, 11, 4, 1784, 86, 1117, 8, 3261, 46, 11, 2, 21, 29, 9, 2841, 23, 4, 1010, 2, 793, 6, 2, 1386, 1830, 10, 10, 246, 50, 9, 6, 2750, 1944, 746, 90, 29, 2, 8, 124, 4, 882, 4, 882, 496, 27, 2, 2213, 537, 121, 127, 1219, 130, 5, 29, 494, 8, 124, 4, 882, 496, 4, 341, 7, 27, 846, 10, 10, 29, 9, 1906, 8, 97, 6, 236, 2, 1311, 8, 4, 2, 7, 31, 7, 2, 91, 2, 3987, 70, 4, 882, 30, 579, 42, 9, 12, 32, 11, 537, 10, 10, 11, 14, 65, 44, 537, 75, 2, 1775, 3353, 2, 1846, 4, 2, 7, 154, 5, 4, 518, 53, 2, 2, 7, 3211, 882, 11, 399, 38, 75, 257, 3807, 19, 2, 17, 29, 456, 4, 65, 7, 27, 205, 113, 10, 10, 2, 4, 2, 2, 9, 242, 4, 91, 1202, 2, 5, 2070, 307, 22, 7, 5168, 126, 93, 40, 2, 13, 188, 1076, 3222, 19, 4, 2, 7, 2348, 537, 23, 53, 537, 21, 82, 40, 2, 13, 2, 14, 280, 13, 219, 4, 2, 431, 758, 859, 4, 953, 1052, 2, 7, 5991, 5, 94, 40, 25, 238, 60, 2, 4, 2, 804, 2, 7, 4, 9941, 132, 8, 67, 6, 22, 15, 9, 283, 8, 5168, 14, 31, 9, 242, 955, 48, 25, 279, 2, 23, 12, 1685, 195, 25, 238, 60, 796, 2, 4, 671, 7, 2804, 5, 4, 559, 154, 888, 7, 726, 50, 26, 49, 7008, 15, 566, 30, 579, 21, 64, 2574]) list([1, 249, 1323, 7, 61, 113, 10, 10, 13, 1637, 14, 20, 56, 33, 2401, 18, 457, 88, 13, 2626, 1400, 45, 3171, 13, 70, 79, 49, 706, 919, 13, 16, 355, 340, 355, 1696, 96, 143, 4, 22, 32, 289, 7, 61, 369, 71, 2359, 5, 13, 16, 131, 2073, 249, 114, 249, 229, 249, 20, 13, 28, 126, 110, 13, 473, 8, 569, 61, 419, 56, 429, 6, 1513, 18, 35, 534, 95, 474, 570, 5, 25, 124, 138, 88, 12, 421, 1543, 52, 725, 6397, 61, 419, 11, 13, 1571, 15, 1543, 20, 11, 4, 2, 5, 296, 12, 3524, 5, 15, 421, 128, 74, 233, 334, 207, 126, 224, 12, 562, 298, 2167, 1272, 7, 2601, 5, 516, 988, 43, 8, 79, 120, 15, 595, 13, 784, 25, 3171, 18, 165, 170, 143, 19, 14, 5, 7224, 6, 226, 251, 7, 61, 113])]
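The integer sequences above can be decoded back into words for inspection. A short sketch (not in the original) using Keras's IMDB word index; indices are offset by 3 because 0, 1 and 2 are reserved for the padding, start and unknown tokens.
word_index = datasets.imdb.get_word_index()
index_word = {index + 3: word for word, index in word_index.items()}  # undo the offset of 3
print(' '.join(index_word.get(i, '?') for i in X_train[0]))  # special tokens print as '?'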
max_len = 200
X_train = pad_sequences(X_train, maxlen = max_len)
X_test = pad_sequences(X_test, maxlen = max_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print(y_train[:5])
X_train shape: (25000, 200)
X_test shape: (25000, 200)
[1 0 0 1 0]
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
embedding_dim = 256
batch_size = 256
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(0.3))
model.add(Conv1D(256, 3, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
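As a quick sanity check (not part of the original flow), the layer shapes can be inspected the same way as in the later example. Since Embedding was created without input_length, the time dimension displays as None; at run time a length-200 input gives a (198, 256) Conv1D output (valid padding, kernel size 3) before GlobalMaxPooling1D collapses it to one 256-dimensional vector per sample.
model.summary()  # Embedding/Conv1D time dimensions show as None here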
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 3)
mc = ModelCheckpoint('best_model.h5', monitor = 'val_acc', mode = 'max', verbose = 1, save_best_only = True)
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics = ['acc'])
history = model.fit(X_train, y_train, epochs = 20, validation_data = (X_test, y_test), callbacks=[es, mc]) # note: the batch_size variable above is unused; fit falls back to the default of 32 (hence 782 steps per epoch below)
loaded_model = load_model('best_model.h5')
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))
782/782 [==============================] - 38s 48ms/step - loss: 0.2827 - acc: 0.8812 테스트 정확도: 0.8812
import urllib.request
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
urllib.request.urlretrieve("https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv", filename="spam.csv")
data = pd.read_csv('spam.csv', encoding='latin-1')
print('Total number of samples :', len(data))
data[:5]
Total number of samples : 5572
| | v1 | v2 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 |
|---|---|---|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... | NaN | NaN | NaN |
| 1 | ham | Ok lar... Joking wif u oni... | NaN | NaN | NaN |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | NaN | NaN | NaN |
| 3 | ham | U dun say so early hor... U c already then say... | NaN | NaN | NaN |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... | NaN | NaN | NaN |
del data['Unnamed: 2']
del data['Unnamed: 3']
del data['Unnamed: 4']
data['v1'] = data['v1'].replace(['ham','spam'],[0,1])
data['v2'].nunique(), data['v1'].nunique()
data.drop_duplicates(subset=['v2'], inplace=True) # remove duplicate contents in the v2 column, if any
print('Total number of samples :', len(data))
Total number of samples : 5169
data['v1'].value_counts().plot(kind='bar');
print(data.groupby('v1').size().reset_index(name='count'))
   v1  count
0   0   4516
1   1    653
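The counts show a strong class imbalance. A quick check (added here for context): a majority-class baseline that always predicts ham already reaches about 87.4% accuracy, which is worth keeping in mind when reading the accuracies below.
print(4516 / (4516 + 653))  # majority-class (ham) baseline: 0.8736...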
X_data = data['v2']
y_data = data['v1']
print('Number of mail bodies: {}'.format(len(X_data)))
print('Number of labels: {}'.format(len(y_data)))
Number of mail bodies: 5169
Number of labels: 5169
# Integer encoding
vocab_size = 1000
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(X_data) # fit the tokenizer on each of the 5,169 rows of X
sequences = tokenizer.texts_to_sequences(X_data) # convert each word to its integer index
print(sequences[:5])
[[47, 433, 780, 705, 662, 64, 8, 94, 121, 434, 142, 68, 57, 137], [49, 306, 435, 6], [53, 537, 8, 20, 4, 934, 2, 220, 706, 267, 70, 2, 2, 359, 537, 604, 82, 436, 185, 707, 437], [6, 226, 152, 23, 347, 6, 138, 145, 56, 152], [935, 1, 97, 96, 69, 453, 2, 877, 69, 198, 105, 438]]
n_of_train = int(len(sequences) * 0.8)
n_of_test = int(len(sequences) - n_of_train)
print('Number of training samples :', n_of_train)
print('Number of test samples :', n_of_test)
Number of training samples : 4135
Number of test samples : 1034
X_data = sequences
print('Maximum mail length : %d' % max(len(l) for l in X_data))
print('Average mail length : %f' % (sum(map(len, X_data))/len(X_data)))
plt.hist([len(s) for s in X_data], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()
Maximum mail length : 172
Average mail length : 12.566841
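Since max_len is set to the maximum mail length below, nothing is truncated. A small helper (hypothetical, not in the original) shows how to check what fraction of samples a given max_len would fully cover:
def coverage(max_len, nested_list):
    # fraction of samples whose length is at most max_len
    count = sum(1 for sample in nested_list if len(sample) <= max_len)
    return count / len(nested_list)

print(coverage(172, X_data))  # 1.0, since 172 is the maximum length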
# Pad the whole dataset to length max_len.
max_len = 172
data = pad_sequences(X_data, maxlen = max_len)
print("훈련 데이터의 크기(shape): ", data.shape)
훈련 데이터의 크기(shape): (5169, 172)
X_test = data[n_of_train:] # the last 1,034 samples of the padded data
y_test = np.array(y_data[n_of_train:]) # the last 1,034 labels
X_train = data[:n_of_train] # the first 4,135 samples of the padded data
y_train = np.array(y_data[:n_of_train]) # the first 4,135 labels
print("훈련용 이메일 데이터의 크기(shape): ", X_train.shape)
print("테스트용 이메일 데이터의 크기(shape): ", X_test.shape)
print("훈련용 레이블의 크기(shape): ", y_train.shape)
print("테스트용 레이블의 크기(shape): ", y_test.shape)
훈련용 이메일 데이터의 크기(shape): (4135, 172) 테스트용 이메일 데이터의 크기(shape): (1034, 172) 훈련용 레이블의 크기(shape): (4135,) 테스트용 레이블의 크기(shape): (1034,)
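Note that this split simply slices the data in its stored order. If shuffling is acceptable for this dataset, an alternative sketch (an assumption, not what the original does) is scikit-learn's train_test_split, which shuffles before splitting:
from sklearn.model_selection import train_test_split

# shuffled 80/20 split; random_state pins the shuffle for reproducibility
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    data, np.array(y_data), test_size=0.2, random_state=0)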
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
model = Sequential()
model.add(Embedding(vocab_size, 32))
model.add(Dropout(0.2))
model.add(Conv1D(32, 5, strides=1, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 3)
mc = ModelCheckpoint('best_model.h5', monitor = 'val_acc', mode = 'max', verbose = 1, save_best_only = True)
history = model.fit(X_train, y_train, epochs = 10, batch_size=64, validation_split=0.2, callbacks=[es, mc])
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_1 (Embedding) (None, None, 32) 32000 _________________________________________________________________ dropout_2 (Dropout) (None, None, 32) 0 _________________________________________________________________ conv1d_1 (Conv1D) (None, None, 32) 5152 _________________________________________________________________ global_max_pooling1d_1 (Glob (None, 32) 0 _________________________________________________________________ dense_2 (Dense) (None, 64) 2112 _________________________________________________________________ dropout_3 (Dropout) (None, 64) 0 _________________________________________________________________ dense_3 (Dense) (None, 1) 65 ================================================================= Total params: 39,329 Trainable params: 39,329 Non-trainable params: 0 _________________________________________________________________ Epoch 1/10 50/52 [===========================>..] - ETA: 0s - loss: 0.4634 - acc: 0.8697 Epoch 00001: val_acc improved from -inf to 0.87304, saving model to best_model.h5 52/52 [==============================] - 1s 27ms/step - loss: 0.4612 - acc: 0.8694 - val_loss: 0.3812 - val_acc: 0.8730 Epoch 2/10 51/52 [============================>.] - ETA: 0s - loss: 0.3565 - acc: 0.8689 Epoch 00002: val_acc did not improve from 0.87304 52/52 [==============================] - 1s 25ms/step - loss: 0.3542 - acc: 0.8697 - val_loss: 0.2762 - val_acc: 0.8730 Epoch 3/10 51/52 [============================>.] - ETA: 0s - loss: 0.1499 - acc: 0.9406 Epoch 00003: val_acc improved from 0.87304 to 0.98549, saving model to best_model.h5 52/52 [==============================] - 1s 26ms/step - loss: 0.1496 - acc: 0.9407 - val_loss: 0.0781 - val_acc: 0.9855 Epoch 4/10 51/52 [============================>.] - ETA: 0s - loss: 0.0528 - acc: 0.9881 Epoch 00004: val_acc did not improve from 0.98549 52/52 [==============================] - 1s 25ms/step - loss: 0.0530 - acc: 0.9876 - val_loss: 0.0559 - val_acc: 0.9843 Epoch 5/10 51/52 [============================>.] - ETA: 0s - loss: 0.0346 - acc: 0.9911 Epoch 00005: val_acc did not improve from 0.98549 52/52 [==============================] - 1s 25ms/step - loss: 0.0344 - acc: 0.9912 - val_loss: 0.0529 - val_acc: 0.9807 Epoch 6/10 50/52 [===========================>..] - ETA: 0s - loss: 0.0218 - acc: 0.9928 Epoch 00006: val_acc did not improve from 0.98549 52/52 [==============================] - 1s 26ms/step - loss: 0.0213 - acc: 0.9930 - val_loss: 0.0516 - val_acc: 0.9807 Epoch 7/10 51/52 [============================>.] - ETA: 0s - loss: 0.0139 - acc: 0.9969 Epoch 00007: val_acc did not improve from 0.98549 52/52 [==============================] - 1s 26ms/step - loss: 0.0138 - acc: 0.9970 - val_loss: 0.0530 - val_acc: 0.9807 Epoch 8/10 51/52 [============================>.] - ETA: 0s - loss: 0.0092 - acc: 0.9982 Epoch 00008: val_acc did not improve from 0.98549 52/52 [==============================] - 1s 26ms/step - loss: 0.0091 - acc: 0.9982 - val_loss: 0.0533 - val_acc: 0.9807 Epoch 9/10 51/52 [============================>.] - ETA: 0s - loss: 0.0081 - acc: 0.9982 Epoch 00009: val_acc did not improve from 0.98549 52/52 [==============================] - 1s 27ms/step - loss: 0.0080 - acc: 0.9982 - val_loss: 0.0542 - val_acc: 0.9807 Epoch 00009: early stopping
print("\n 테스트 정확도: %.4f" % (model.evaluate(X_test, y_test)[1]))
33/33 [==============================] - 0s 3ms/step - loss: 0.0706 - acc: 0.9787 테스트 정확도: 0.9787
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense, Input, Flatten, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
# Define hyperparameters
embedding_dim = 128
dropout_prob = (0.5, 0.8)
num_filters = 128
# Define the input and embedding layers -> 50% dropout
model_input = Input(shape = (max_len,))
z = Embedding(vocab_size, embedding_dim, input_length = max_len, name="embedding")(model_input)
z = Dropout(dropout_prob[0])(z)
# Conv1D + global max pooling block for each kernel size
conv_blocks = []
for sz in [3, 4, 5]:
conv = Conv1D(filters = num_filters,
kernel_size = sz,
padding = "valid",
activation = "relu",
strides = 1)(z)
conv = GlobalMaxPooling1D()(conv)
conv = Flatten()(conv)
conv_blocks.append(conv)
# concatenate the branches and connect them to the dense layers
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
z = Dropout(dropout_prob[1])(z)
z = Dense(128, activation="relu")(z)
model_output = Dense(1, activation="sigmoid")(z)
model = Model(model_input, model_output)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])
# train for binary classification
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('CNN_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
model.fit(X_train, y_train, batch_size = 64, epochs=10, validation_data = (X_test, y_test), verbose=2, callbacks=[es, mc])
# load and test the model
loaded_model = load_model('CNN_model.h5')
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))
33/33 [==============================] - 1s 39ms/step - loss: 0.3654 - acc: 0.8868 테스트 정확도: 0.8868
from konlpy.tag import Okt
okt = Okt()
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords  # needed below in sentiment_predict
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
True
def sentiment_predict(new_sentence):
    new_sentence = okt.morphs(new_sentence, stem=True) # tokenize
    new_sentence = [word for word in new_sentence if not word in stopwords.words('english')] # remove stopwords
    encoded = tokenizer.texts_to_sequences([new_sentence]) # integer encoding
    pad_new = pad_sequences(encoded, maxlen = max_len) # padding
    score = float(model.predict(pad_new)) # predict
    if(score > 0.5):
        print("This is a positive review with {:.2f}% probability.\n".format(score * 100))
    else:
        print("This is a negative review with {:.2f}% probability.\n".format((1 - score) * 100))
sentiment_predict('이 영화 개꿀잼 ㅋㅋㅋ')
This is a negative review with 86.70% probability.
Note that the tokenizer and model used here were fitted on the English spam data above, so a Korean sentence is almost entirely out-of-vocabulary and this prediction should not be taken at face value.
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
import urllib.request
urllib.request.urlretrieve("https://github.com/ajinkyaT/CNN_Intent_Classification/raw/master/data/train_text.npy", filename="train_text.npy")
urllib.request.urlretrieve("https://github.com/ajinkyaT/CNN_Intent_Classification/raw/master/data/test_text.npy", filename="test_text.npy")
urllib.request.urlretrieve("https://github.com/ajinkyaT/CNN_Intent_Classification/raw/master/data/train_label.npy", filename="train_label.npy")
urllib.request.urlretrieve("https://github.com/ajinkyaT/CNN_Intent_Classification/raw/master/data/test_label.npy", filename="test_label.npy")
('test_label.npy', <http.client.HTTPMessage at 0x7f604dd1d6a0>)
# NumPy 1.16.3+ defaults allow_pickle to False, so pass it explicitly instead of monkey-patching np.load.
intent_train = np.load('train_text.npy', allow_pickle=True).tolist()
label_train = np.load('train_label.npy', allow_pickle=True).tolist()
intent_test = np.load('test_text.npy', allow_pickle=True).tolist()
label_test = np.load('test_label.npy', allow_pickle=True).tolist()
print('Number of training sentences :', len(intent_train))
print('Number of training labels :', len(label_train))
print('Number of test sentences :', len(intent_test))
print('Number of test labels :', len(label_test))
Number of training sentences : 11784
Number of training labels : 11784
Number of test sentences : 600
Number of test labels : 600
print(intent_train[:5])
print(label_train[:5])
print(intent_train[2000:2002])
print(label_train[2000:2002])
print(intent_train[4000:4002])
print(label_train[4000:4002])
print(intent_train[6000:6002])
print(label_train[6000:6002])
print(intent_train[8000:8002])
print(label_train[8000:8002])
print(intent_train[10000:10002])
print(label_train[10000:10002])
['add another song to the cita rom ntica playlist', 'add clem burke in my playlist pre party r b jams', 'add live from aragon ballroom to trapeo', 'add unite and win to my night out', 'add track to my digster future hits'] ['AddToPlaylist', 'AddToPlaylist', 'AddToPlaylist', 'AddToPlaylist', 'AddToPlaylist'] ['please book reservations for 3 people at a restaurant in alderwood manor', 'book a table in mt for 3 for now at a pub that serves south indian'] ['BookRestaurant', 'BookRestaurant'] ['what will the weather be like on feb 8 , 2034 in cedar mountain wilderness', "tell me the forecast in the same area here on robert e lee 's birthday"] ['GetWeather', 'GetWeather'] ['rate the current album one points', 'i give a zero rating for this essay'] ['RateBook', 'RateBook'] ["i'm trying to find the show chant ii", 'find spirit of the bush'] ['SearchCreativeWork', 'SearchCreativeWork'] ['when is blood and ice cream trilogie playing at the nearest movie theatre \\?', 'show movie schedules'] ['SearchScreeningEvent', 'SearchScreeningEvent']
temp = pd.Series(label_train)
temp.value_counts().plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7f604dd94278>
# Label encoding: assign a unique integer to each label
idx_encode = preprocessing.LabelEncoder()
idx_encode.fit(label_train)
label_train = idx_encode.transform(label_train) # convert to the assigned integers
label_test = idx_encode.transform(label_test) # convert with the same mapping
label_idx = dict(zip(list(idx_encode.classes_), idx_encode.transform(list(idx_encode.classes_))))
print(label_idx)
{'AddToPlaylist': 0, 'BookRestaurant': 1, 'GetWeather': 2, 'RateBook': 3, 'SearchCreativeWork': 4, 'SearchScreeningEvent': 5}
print(intent_train[:5])
print(label_train[:5])
print(intent_test[:5])
print(label_test[:5])
['add another song to the cita rom ntica playlist', 'add clem burke in my playlist pre party r b jams', 'add live from aragon ballroom to trapeo', 'add unite and win to my night out', 'add track to my digster future hits'] [0 0 0 0 0] ["i 'd like to have this track onto my classical relaxations playlist", 'add the album to my flow espa ol playlist', 'add digging now to my young at heart playlist', 'add this song by too poetic to my piano ballads playlist', 'add this album to old school death metal'] [0 0 0 0 0]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(intent_train)
sequences = tokenizer.texts_to_sequences(intent_train)
sequences[:5] # print the first five samples
[[11, 191, 61, 4, 1, 4013, 1141, 1572, 15], [11, 2624, 1573, 3, 14, 15, 939, 82, 256, 188, 548], [11, 187, 42, 2625, 4014, 4, 1968], [11, 2626, 22, 2627, 4, 14, 192, 27], [11, 92, 4, 14, 651, 520, 195]]
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
print('Vocabulary size :', vocab_size)
Vocabulary size : 9870
print('Maximum sentence length :', max(len(l) for l in sequences))
print('Average sentence length :', sum(map(len, sequences))/len(sequences))
plt.hist([len(s) for s in sequences], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()
Maximum sentence length : 35
Average sentence length : 9.364392396469789
max_len = 35
intent_train = pad_sequences(sequences, maxlen = max_len)
label_train = to_categorical(np.asarray(label_train))
print('Shape of the full data:', intent_train.shape)
print('Shape of the label data:', label_train.shape)
Shape of the full data: (11784, 35)
Shape of the label data: (11784, 6)
print(intent_train[0])
print(label_train[0])
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 11 191
61 4 1 4013 1141 1572 15]
[1. 0. 0. 0. 0. 0.]
indices = np.arange(intent_train.shape[0])
np.random.shuffle(indices)
print(indices)
intent_train = intent_train[indices]
label_train = label_train[indices]
n_of_val = int(0.1 * intent_train.shape[0])
print(n_of_val)
[3107  310 9113 ... 9998 3835 4179]
1178
X_train = intent_train[:-n_of_val]
y_train = label_train[:-n_of_val]
X_val = intent_train[-n_of_val:]
y_val = label_train[-n_of_val:]
X_test = intent_test
y_test = label_test
print('Shape of the training data:', X_train.shape)
print('Shape of the validation data:', X_val.shape)
print('Shape of the training labels:', y_train.shape)
print('Shape of the validation labels:', y_val.shape)
print('Number of test samples :', len(X_test))
print('Number of test labels :', len(y_test))
Shape of the training data: (10606, 35)
Shape of the validation data: (1178, 35)
Shape of the training labels: (10606, 6)
Shape of the validation labels: (1178, 6)
Number of test samples : 600
Number of test labels : 600
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
--2020-08-23 15:11:00-- http://nlp.stanford.edu/data/glove.6B.zip Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140 Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected. HTTP request sent, awaiting response... 302 Found Location: https://nlp.stanford.edu/data/glove.6B.zip [following] --2020-08-23 15:11:00-- https://nlp.stanford.edu/data/glove.6B.zip Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected. HTTP request sent, awaiting response... 301 Moved Permanently Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following] --2020-08-23 15:11:01-- http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22 Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected. HTTP request sent, awaiting response... 200 OK Length: 862182613 (822M) [application/zip] Saving to: ‘glove.6B.zip’ glove.6B.zip 100%[===================>] 822.24M 2.09MB/s in 6m 28s 2020-08-23 15:17:29 (2.12 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613] Archive: glove.6B.zip inflating: glove.6B.50d.txt inflating: glove.6B.100d.txt inflating: glove.6B.200d.txt inflating: glove.6B.300d.txt
embedding_dict = dict()
f = open(os.path.join('glove.6B.100d.txt'), encoding='utf-8')
for line in f:
    word_vector = line.split()
    word = word_vector[0]
    word_vector_arr = np.asarray(word_vector[1:], dtype='float32') # convert to an array of 100 values
    embedding_dict[word] = word_vector_arr
f.close()
print('Found %s embedding vectors.' % len(embedding_dict))
Found 400000 embedding vectors.
print(embedding_dict['respectable'])
print(len(embedding_dict['respectable']))
[-0.049773 0.19903 0.10585 0.1391 -0.32395 0.44053 0.3947 -0.22805 -0.25793 0.49768 0.15384 -0.08831 0.0782 -0.8299 -0.037788 0.16772 -0.45197 -0.17085 0.74756 0.98256 0.81872 0.28507 0.16178 -0.48626 -0.006265 -0.92469 -0.30625 -0.067318 -0.046762 -0.76291 -0.0025264 -0.018795 0.12882 -0.52457 0.3586 0.43119 -0.89477 -0.057421 -0.53724 0.25587 0.55195 0.44698 -0.24252 0.29946 0.25776 -0.8717 0.68426 -0.05688 -0.1848 -0.59352 -0.11227 -0.57692 -0.013593 0.18488 -0.32507 -0.90171 0.17672 0.075601 0.54896 -0.21488 -0.54018 -0.45882 -0.79536 0.26331 0.18879 -0.16363 0.3975 0.1099 0.1164 -0.083499 0.50159 0.35802 0.25677 0.088546 0.42108 0.28674 -0.71285 -0.82915 0.15297 -0.82712 0.022112 1.067 -0.31776 0.1211 -0.069755 -0.61327 0.27308 -0.42638 -0.085084 -0.17694 -0.0090944 0.1109 0.62543 -0.23682 -0.44928 -0.3667 -0.21616 -0.19187 -0.032502 0.38025 ] 100
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
np.shape(embedding_matrix)
(9870, 100)
for word, i in word_index.items():
embedding_vector = embedding_dict.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
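Words missing from GloVe keep their zero-initialized rows. A quick check (added here as a sketch, not in the original) of how many vocabulary words actually received a pretrained vector:
# count vocabulary words found in the GloVe dictionary
covered = sum(1 for word in word_index if embedding_dict.get(word) is not None)
print('words with pretrained vectors:', covered, '/', len(word_index))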
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense, Input, Flatten, Concatenate
filter_sizes = [2,3,5]
num_filters = 512
drop = 0.5
model_input = Input(shape = (max_len,))
z = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix],
input_length=max_len, trainable=False)(model_input)
conv_blocks = []
for sz in filter_sizes:
conv = Conv1D(filters = num_filters,
kernel_size = sz,
padding = "valid",
activation = "relu",
strides = 1)(z)
conv = GlobalMaxPooling1D()(conv)
conv = Flatten()(conv)
conv_blocks.append(conv)
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
z = Dropout(drop)(z)
model_output = Dense(len(label_idx), activation='softmax')(z)
model = Model(model_input, model_output)
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['acc'])
model.summary()
Model: "functional_3"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_2 (InputLayer) [(None, 35)] 0
__________________________________________________________________________________________________
embedding_2 (Embedding) (None, 35, 100) 987000 input_2[0][0]
__________________________________________________________________________________________________
conv1d_5 (Conv1D) (None, 34, 512) 102912 embedding_2[0][0]
__________________________________________________________________________________________________
conv1d_6 (Conv1D) (None, 33, 512) 154112 embedding_2[0][0]
__________________________________________________________________________________________________
conv1d_7 (Conv1D) (None, 31, 512) 256512 embedding_2[0][0]
__________________________________________________________________________________________________
global_max_pooling1d_5 (GlobalM (None, 512) 0 conv1d_5[0][0]
__________________________________________________________________________________________________
global_max_pooling1d_6 (GlobalM (None, 512) 0 conv1d_6[0][0]
__________________________________________________________________________________________________
global_max_pooling1d_7 (GlobalM (None, 512) 0 conv1d_7[0][0]
__________________________________________________________________________________________________
flatten_3 (Flatten) (None, 512) 0 global_max_pooling1d_5[0][0]
__________________________________________________________________________________________________
flatten_4 (Flatten) (None, 512) 0 global_max_pooling1d_6[0][0]
__________________________________________________________________________________________________
flatten_5 (Flatten) (None, 512) 0 global_max_pooling1d_7[0][0]
__________________________________________________________________________________________________
concatenate_1 (Concatenate) (None, 1536) 0 flatten_3[0][0]
flatten_4[0][0]
flatten_5[0][0]
__________________________________________________________________________________________________
dropout_6 (Dropout) (None, 1536) 0 concatenate_1[0][0]
__________________________________________________________________________________________________
dense_6 (Dense) (None, 6) 9222 dropout_6[0][0]
==================================================================================================
Total params: 1,509,758
Trainable params: 522,758
Non-trainable params: 987,000
__________________________________________________________________________________________________
history = model.fit(X_train, y_train,
batch_size=64,
epochs=10,
validation_data = (X_val, y_val))
epochs = range(1, len(history.history['acc']) + 1)
plt.plot(epochs, history.history['acc'])
plt.plot(epochs, history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epochs')
plt.legend(['train', 'validation'], loc='lower right')
plt.show()
epochs = range(1, len(history.history['loss']) + 1)
plt.plot(epochs, history.history['loss'])
plt.plot(epochs, history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epochs')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=max_len)
y_predicted = model.predict(X_test)
y_predicted = y_predicted.argmax(axis=-1) # convert probability vectors to predicted integer labels
y_predicted = idx_encode.inverse_transform(y_predicted) # map the integers back to label names
y_test = idx_encode.inverse_transform(y_test) # map the integers back to label names
print('accuracy: ', sum(y_predicted == y_test) / len(y_test))
print("Precision, Recall and F1-Score:\n\n", classification_report(y_test, y_predicted))
accuracy: 0.98
Precision, Recall and F1-Score:
precision recall f1-score support
AddToPlaylist 1.00 1.00 1.00 100
BookRestaurant 1.00 1.00 1.00 100
GetWeather 0.99 0.99 0.99 100
RateBook 1.00 1.00 1.00 100
SearchCreativeWork 0.91 1.00 0.95 100
SearchScreeningEvent 0.99 0.89 0.94 100
accuracy 0.98 600
macro avg 0.98 0.98 0.98 600
weighted avg 0.98 0.98 0.98 600
Building a part-of-speech tagger
Tagging is a classic example of sequence labeling.
Bidirectional LSTM
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] /root/nltk_data... [nltk_data] Package averaged_perceptron_tagger is already up-to- [nltk_data] date! [nltk_data] Downloading package maxent_ne_chunker to [nltk_data] /root/nltk_data... [nltk_data] Package maxent_ne_chunker is already up-to-date! [nltk_data] Downloading package words to /root/nltk_data... [nltk_data] Unzipping corpora/words.zip.
True
from nltk import word_tokenize, pos_tag, ne_chunk
sentence = "James is working at Disney in London"
sentence=pos_tag(word_tokenize(sentence))
print(sentence) # tokenization and POS tagging in one step
[('James', 'NNP'), ('is', 'VBZ'), ('working', 'VBG'), ('at', 'IN'), ('Disney', 'NNP'), ('in', 'IN'), ('London', 'NNP')]
sentence=ne_chunk(sentence)
print(sentence) # named entity recognition
(S (PERSON James/NNP) is/VBZ working/VBG at/IN (ORGANIZATION Disney/NNP) in/IN (GPE London/NNP))
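The chunk tree can also be flattened into per-token tags with NLTK's tree2conlltags, which produces exactly the B/I/O scheme illustrated below (a short sketch added for context):
from nltk.chunk import tree2conlltags

iob_tagged = tree2conlltags(sentence)  # (token, POS tag, IOB tag) triples
print(iob_tagged)
# e.g. [('James', 'NNP', 'B-PERSON'), ('is', 'VBZ', 'O'), ..., ('London', 'NNP', 'B-GPE')]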
In BIO notation, B marks the first token of an entity, I a token inside one, and O a token outside any entity. Tagging the characters of "해리포터 보러 가자" ("Let's go see Harry Potter"):
해 B
리 I
포 I
터 I
보 O
러 O
가 O
자 O
With entity types attached, "해리포터 보러 메가박스 가자" ("Let's go see Harry Potter at Megabox") is tagged as:
해 B-movie
리 I-movie
포 I-movie
터 I-movie
보 O
러 O
메 B-theater
가 I-theater
박 I-theater
스 I-theater
가 O
자 O
Part-of-speech information
import re
%matplotlib inline
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
# Data preprocessing
f = open('/content/train.txt', 'r')
tagged_sentences = []
sentence = []
for line in f:
    if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\n":
        if len(sentence) > 0:
            tagged_sentences.append(sentence)
            sentence = []
        continue
    splits = line.split(' ') # split the attributes on spaces
    splits[-1] = re.sub(r'\n', '', splits[-1]) # strip the trailing newline \n
    word = splits[0].lower() # lowercase each word before storing it
    sentence.append([word, splits[-1]]) # keep only the word and its NER tag
print("Total number of samples: ", len(tagged_sentences)) # print the total sample count
print(tagged_sentences[0]) # print the first sample
Total number of samples:  8415
[['eu', 'B-ORG'], ['rejects', 'O'], ['german', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['british', 'B-MISC'], ['lamb', 'O'], ['.', 'O']]
sentences, ner_tags = [], []
for tagged_sentence in tagged_sentences: # iterate over the 8,415 sentence samples
    sentence, tag_info = zip(*tagged_sentence) # unzip each sample into its words and NER tags
    sentences.append(list(sentence)) # keep only the words
    ner_tags.append(list(tag_info)) # keep only the NER tags
# print the first sentence sample
print(sentences[0])
print(ner_tags[0])
# print the thirteenth sentence sample
print(sentences[12])
print(ner_tags[12])
['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.'] ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'] ['only', 'france', 'and', 'britain', 'backed', 'fischler', "'s", 'proposal', '.'] ['O', 'B-LOC', 'O', 'B-LOC', 'O', 'B-PER', 'O', 'O', 'O']
print('Maximum sample length : %d' % max(len(l) for l in sentences))
print('Average sample length : %f' % (sum(map(len, sentences))/len(sentences)))
plt.hist([len(s) for s in sentences], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()
Maximum sample length : 60
Average sample length : 13.444801
max_words = 4000
src_tokenizer = Tokenizer(num_words=max_words, oov_token='OOV')
src_tokenizer.fit_on_texts(sentences)
tar_tokenizer = Tokenizer()
tar_tokenizer.fit_on_texts(ner_tags)
vocab_size = max_words
tag_size = len(tar_tokenizer.word_index) + 1
print('Vocabulary size : {}'.format(vocab_size))
print('Number of NER tags : {}'.format(tag_size))
# Integer encoding
X_train = src_tokenizer.texts_to_sequences(sentences)
y_train = tar_tokenizer.texts_to_sequences(ner_tags)
print(X_train[0])
print(y_train[0])
Vocabulary size : 4000
Number of NER tags : 10
[1190, 1, 199, 814, 9, 1, 262, 3734, 3]
[3, 1, 7, 1, 1, 1, 7, 1, 1]
# Decoding (integers -> text)
index_to_word = src_tokenizer.index_word
index_to_ner = tar_tokenizer.index_word
decoded = []
for index in X_train[0] : # for each index in the first sample
    decoded.append(index_to_word[index]) # convert back to the word
print('Original sentence : {}'.format(sentences[0]))
print('Sentence with rare words replaced by OOV : {}'.format(decoded))
Original sentence : ['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
Sentence with rare words replaced by OOV : ['eu', 'OOV', 'german', 'call', 'to', 'OOV', 'british', 'lamb', '.']
# Padding
max_len = 70
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
# pad every sample in X_train with trailing zeros up to max_len
y_train = pad_sequences(y_train, padding='post', maxlen=max_len)
# pad every sample in y_train with trailing zeros up to max_len
# split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=.2, random_state=777)
# one-hot encoding
y_train = to_categorical(y_train, num_classes=tag_size)
y_test = to_categorical(y_test, num_classes=tag_size)
print('Shape of training sentences : {}'.format(X_train.shape))
print('Shape of training labels : {}'.format(y_train.shape))
print('Shape of test sentences : {}'.format(X_test.shape))
print('Shape of test labels : {}'.format(y_test.shape))
Shape of training sentences : (6732, 70)
Shape of training labels : (6732, 70, 10)
Shape of test sentences : (1683, 70)
Shape of test labels : (1683, 70, 10)
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional, TimeDistributed
from keras.optimizers import Adam
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len, mask_zero=True))
model.add(Bidirectional(LSTM(256, return_sequences=True))) # return_sequences=True because this is a many-to-many task
model.add(TimeDistributed(Dense(tag_size, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=128, epochs=8, validation_data=(X_test, y_test))
print("\n 테스트 정확도: %.4f" % (model.evaluate(X_test, y_test)[1]))
14/14 [==============================] - 5s 335ms/step - loss: 0.1408 - accuracy: 0.8274 테스트 정확도: 0.8274
i=10 # index of the test sample to inspect
y_predicted = model.predict(np.array([X_test[i]])) # predict y for the given test sample
y_predicted = np.argmax(y_predicted, axis=-1) # convert one-hot vectors back to integer encoding
true = np.argmax(y_test[i], -1) # convert one-hot vectors back to integer encoding
print("{:15}|{:5}|{}".format("word", "actual", "pred"))
print(35 * "-")
for w, t, pred in zip(X_test[i], true, y_predicted[0]):
    if w != 0: # skip PAD values
        print("{:17}: {:7} {}".format(index_to_word[w], index_to_ner[t].upper(), index_to_ner[pred].upper()))
word           |actual|pred
-----------------------------------
-                : O       O
OOV              : O       O
iraq             : B-LOC   O
for              : O       O
riots            : O       O
in               : O       O
jordan           : B-LOC   O
is               : O       O
a                : O       O
OOV              : O       O
game             : O       O
.                : O       O
Both location entities (iraq, jordan) were missed here; plain accuracy is inflated by the dominant O tag, so entity-level metrics such as F1 give a better picture.
nltk.download('treebank')
[nltk_data] Downloading package treebank to /root/nltk_data... [nltk_data] Unzipping corpora/treebank.zip.
True
import nltk
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
tagged_sentences = nltk.corpus.treebank.tagged_sents() # load tokenized, POS-tagged sentences
print("Number of POS-tagged sentences: ", len(tagged_sentences)) # print the sample count
Number of POS-tagged sentences:  3914
print(tagged_sentences[0]) # print the first sample
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
sentences, pos_tags = [], []
for tagged_sentence in tagged_sentences: # iterate over the 3,914 sentence samples
    sentence, tag_info = zip(*tagged_sentence) # unzip each sample into its words and POS tags
    sentences.append(list(sentence)) # keep only the words
    pos_tags.append(list(tag_info)) # keep only the POS tags
print(sentences[0])
print(pos_tags[0])
print(sentences[8])
print(pos_tags[8])
print('Maximum sample length : %d' % max(len(l) for l in sentences))
print('Average sample length : %f' % (sum(map(len, sentences))/len(sentences)))
plt.hist([len(s) for s in sentences], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']
['We', "'re", 'talking', 'about', 'years', 'ago', 'before', 'anyone', 'heard', 'of', 'asbestos', 'having', 'any', 'questionable', 'properties', '.']
['PRP', 'VBP', 'VBG', 'IN', 'NNS', 'IN', 'IN', 'NN', 'VBD', 'IN', 'NN', 'VBG', 'DT', 'JJ', 'NNS', '.']
Maximum sample length : 271
Average sample length : 25.722024
def tokenize(samples):
tokenizer = Tokenizer()
tokenizer.fit_on_texts(samples)
return tokenizer
src_tokenizer = tokenize(sentences)
tar_tokenizer = tokenize(pos_tags)
vocab_size = len(src_tokenizer.word_index) + 1
tag_size = len(tar_tokenizer.word_index) + 1
print('Vocabulary size : {}'.format(vocab_size))
print('Number of POS tags : {}'.format(tag_size))
X_train = src_tokenizer.texts_to_sequences(sentences)
y_train = tar_tokenizer.texts_to_sequences(pos_tags)
print(X_train[:2])
print(y_train[:2])
max_len = 150
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
# pad every sample in X_train with trailing zeros up to max_len
y_train = pad_sequences(y_train, padding='post', maxlen=max_len)
# pad every sample in y_train with trailing zeros up to max_len
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=.2, random_state=777)
y_train = to_categorical(y_train, num_classes=tag_size)
y_test = to_categorical(y_test, num_classes=tag_size)
print('Shape of training sentences : {}'.format(X_train.shape))
print('Shape of training labels : {}'.format(y_train.shape))
print('Shape of test sentences : {}'.format(X_test.shape))
print('Shape of test labels : {}'.format(y_test.shape))
Vocabulary size : 11388
Number of POS tags : 47
[[5601, 3746, 1, 2024, 86, 331, 1, 46, 2405, 2, 131, 27, 6, 2025, 332, 459, 2026, 3], [31, 3746, 20, 177, 4, 5602, 2915, 1, 2, 2916, 637, 147, 3]]
[[3, 3, 8, 10, 6, 7, 8, 21, 13, 4, 1, 2, 4, 7, 1, 3, 10, 9], [3, 3, 17, 1, 2, 3, 3, 8, 4, 3, 19, 1, 9]]
Shape of training sentences : (3131, 150)
Shape of training labels : (3131, 150, 47)
Shape of test sentences : (783, 150)
Shape of test labels : (783, 150, 47)
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding
from keras.optimizers import Adam
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_len, mask_zero=True))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(tag_size, activation=('softmax'))))
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=128, epochs=6, validation_data=(X_test, y_test))
Epoch 1/6 25/25 [==============================] - 72s 3s/step - loss: 0.5738 - accuracy: 0.1383 - val_loss: 0.5069 - val_accuracy: 0.1691 Epoch 2/6 25/25 [==============================] - 70s 3s/step - loss: 0.4924 - accuracy: 0.2307 - val_loss: 0.4630 - val_accuracy: 0.3586 Epoch 3/6 25/25 [==============================] - 70s 3s/step - loss: 0.4119 - accuracy: 0.4305 - val_loss: 0.3302 - val_accuracy: 0.5008 Epoch 4/6 25/25 [==============================] - 70s 3s/step - loss: 0.2624 - accuracy: 0.6022 - val_loss: 0.1979 - val_accuracy: 0.7050 Epoch 5/6 25/25 [==============================] - 70s 3s/step - loss: 0.1459 - accuracy: 0.8095 - val_loss: 0.1109 - val_accuracy: 0.8551 Epoch 6/6 25/25 [==============================] - 70s 3s/step - loss: 0.0772 - accuracy: 0.9050 - val_loss: 0.0721 - val_accuracy: 0.8940
<tensorflow.python.keras.callbacks.History at 0x7feb5cf50f98>
print("\n 테스트 정확도: %.4f" % (model.evaluate(X_test, y_test)[1]))
index_to_word=src_tokenizer.index_word
index_to_tag=tar_tokenizer.index_word
i=10 # index of the test sample to inspect
y_predicted = model.predict(np.array([X_test[i]])) # predict y for the given test sample
y_predicted = np.argmax(y_predicted, axis=-1) # convert one-hot vectors back to integer encoding
true = np.argmax(y_test[i], -1) # convert one-hot vectors back to integer encoding
print("{:15}|{:5}|{}".format("word", "actual", "pred"))
print(35 * "-")
for w, t, pred in zip(X_test[i], true, y_predicted[0]):
    if w != 0: # skip PAD values
        print("{:17}: {:7} {}".format(index_to_word[w], index_to_tag[t].upper(), index_to_tag[pred].upper()))
25/25 [==============================] - 7s 299ms/step - loss: 0.0721 - accuracy: 0.8940 Test accuracy: 0.8940
word           |actual|pred
-----------------------------------
in               : IN      IN
addition         : NN      NN
,                : ,       ,
buick            : NNP     NNP
is               : VBZ     VBZ
a                : DT      DT
relatively       : RB      RB
respected        : VBN     VBN
nameplate        : NN      NN
among            : IN      IN
american         : NNP     NNP
express          : NNP     NNP
card             : NN      NN
holders          : NNS     NNS
,                : ,       ,
says             : VBZ     VBZ
0                : -NONE-  -NONE-
*t*-1            : -NONE-  -NONE-
an               : DT      DT
american         : NNP     NNP
express          : NNP     NNP
spokeswoman      : NN      NN
.                : .       .
BPE (byte pair encoding) originally comes from data compression: the most frequent pair of adjacent symbols is repeatedly replaced by a new, unused symbol. For example, compressing the string aaabdaaabac:
aaabdaaabac
-> Z=aa
ZabdZabac
-> Y=ab
ZYdZYac
-> X=ZY
XdXac
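The same toy compression can be reproduced in a few lines of Python (a sketch; the pair choices are hand-picked to match the steps above):
s = "aaabdaaabac"
s = s.replace("aa", "Z")  # ZabdZabac
s = s.replace("ab", "Y")  # ZYdZYac
s = s.replace("ZY", "X")  # XdXac
print(s)                  # XdXac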
# dictionary
# words in the training data and their frequencies
low : 5, lower : 2, newest : 6, widest : 3
If we call this word:frequency structure the dictionary,
its vocabulary is composed as follows.
# vocabulary
low, lower, newest, widest
If the word lowest appears here, an OOV problem occurs.
Applying BPE instead, the dictionary is first split into characters:
# dictionary
l o w : 5, l o w e r : 2, n e w e s t : 6, w i d e s t : 3
# vocabulary
l, o, w, e, r, n, s, t, i, d
# dictionary update!
l o w : 5,
l o w e r : 2,
n e w es t : 6,
w i d es t : 3
# vocabulary update!
l, o, w, e, r, n, s, t, i, d, es
# dictionary update!
l o w : 5,
l o w e r : 2,
n e w est : 6,
w i d est : 3
# vocabulary update!
l, o, w, e, r, n, s, t, i, d, es, est
# dictionary update!
lo w : 5,
lo w e r : 2,
n e w est : 6,
w i d est : 3
# vocabulary update!
l, o, w, e, r, n, s, t, i, d, es, est, lo
...
...
After repeating this process 10 times:
# dictionary update!
low : 5,
low e r : 2,
newest : 6,
widest : 3
# vocabulary update!
l, o, w, e, r, n, s, t, i, d, es, est, lo, low, ne, new, newest, wi, wid, widest
Now the word lowest is no longer OOV, because low and est are both in the vocabulary.
import re, collections
from IPython.display import display, Markdown, Latex
# number of BPE merge operations
num_merges = 10
dictionary = {'l o w </w>' : 5,
'l o w e r </w>' : 2,
'n e w e s t </w>':6,
'w i d e s t </w>':3
}
def get_stats(dictionary):
    # count the frequency of each pair of adjacent symbols
    pairs = collections.defaultdict(int)
    for word, freq in dictionary.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i],symbols[i+1]] += freq
    print('current pair frequencies :', dict(pairs))
    return pairs
def merge_dictionary(pair, v_in):
v_out = {}
bigram = re.escape(' '.join(pair))
p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
for word in v_in:
w_out = p.sub(''.join(pair), word)
v_out[w_out] = v_in[word]
return v_out
bpe_codes = {}
bpe_codes_reverse = {}
for i in range(num_merges):
display(Markdown("### Iteration {}".format(i + 1)))
pairs = get_stats(dictionary)
best = max(pairs, key=pairs.get)
dictionary = merge_dictionary(best, dictionary)
bpe_codes[best] = i
bpe_codes_reverse[best[0] + best[1]] = best
print("new merge: {}".format(best))
print("dictionary: {}".format(dictionary))
current pair frequencies : {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 8, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('e', 's'): 9, ('s', 't'): 9, ('t', '</w>'): 9, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'e'): 3}
new merge: ('e', 's')
dictionary: {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w es t </w>': 6, 'w i d es t </w>': 3}
current pair frequencies : {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'es'): 6, ('es', 't'): 9, ('t', '</w>'): 9, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'es'): 3}
new merge: ('es', 't')
dictionary: {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est </w>': 6, 'w i d est </w>': 3}
current pair frequencies : {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'est'): 6, ('est', '</w>'): 9, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est'): 3}
new merge: ('est', '</w>')
dictionary: {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
current pair frequencies : {('l', 'o'): 7, ('o', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'est</w>'): 6, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
new merge: ('l', 'o')
dictionary: {'lo w </w>': 5, 'lo w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
current pair frequencies : {('lo', 'w'): 7, ('w', '</w>'): 5, ('w', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'est</w>'): 6, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
new merge: ('lo', 'w')
dictionary: {'low </w>': 5, 'low e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
current pair frequencies : {('low', '</w>'): 5, ('low', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'est</w>'): 6, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
new merge: ('n', 'e')
dictionary: {'low </w>': 5, 'low e r </w>': 2, 'ne w est</w>': 6, 'w i d est</w>': 3}
current pair frequencies : {('low', '</w>'): 5, ('low', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('ne', 'w'): 6, ('w', 'est</w>'): 6, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
new merge: ('ne', 'w')
dictionary: {'low </w>': 5, 'low e r </w>': 2, 'new est</w>': 6, 'w i d est</w>': 3}
current pair frequencies : {('low', '</w>'): 5, ('low', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('new', 'est</w>'): 6, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
new merge: ('new', 'est</w>')
dictionary: {'low </w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3}
current pair frequencies : {('low', '</w>'): 5, ('low', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
new merge: ('low', '</w>')
dictionary: {'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3}
current pair frequencies : {('low', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est</w>'): 3}
new merge: ('w', 'i')
dictionary: {'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'wi d est</w>': 3}
print(bpe_codes)
{('e', 's'): 0, ('es', 't'): 1, ('est', '</w>'): 2, ('l', 'o'): 3, ('lo', 'w'): 4, ('n', 'e'): 5, ('ne', 'w'): 6, ('new', 'est</w>'): 7, ('low', '</w>'): 8, ('w', 'i'): 9}
def get_pairs(word):
"""Return set of symbol pairs in a word.
Word is represented as a tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
def encode(orig):
"""Encode word based on list of BPE merge operations, which are applied consecutively"""
word = tuple(orig) + ('</w>',)
display(Markdown("__word split into characters:__ <tt>{}</tt>".format(word)))
pairs = get_pairs(word)
if not pairs:
return orig
iteration = 0
while True:
iteration += 1
display(Markdown("__Iteration {}:__".format(iteration)))
print("bigrams in the word: {}".format(pairs))
bigram = min(pairs, key = lambda pair: bpe_codes.get(pair, float('inf')))
print("candidate for merging: {}".format(bigram))
if bigram not in bpe_codes:
display(Markdown("__Candidate not in BPE merges, algorithm stops.__"))
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word)-1 and word[i+1] == second:
new_word.append(first+second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
print("word after merging: {}".format(word))
if len(word) == 1:
break
else:
pairs = get_pairs(word)
# the special token </w> is not included in the returned word.
if word[-1] == '</w>':
word = word[:-1]
elif word[-1].endswith('</w>'):
word = word[:-1] + (word[-1].replace('</w>',''),)
return word
encode("loki")
word split into characters: ('l', 'o', 'k', 'i', '</w>')
Iteration 1:
bigrams in the word: {('l', 'o'), ('k', 'i'), ('i', '</w>'), ('o', 'k')}
candidate for merging: ('l', 'o')
word after merging: ('lo', 'k', 'i', '</w>')
Iteration 2:
bigrams in the word: {('lo', 'k'), ('k', 'i'), ('i', '</w>')}
candidate for merging: ('lo', 'k')
Candidate not in BPE merges, algorithm stops.
('lo', 'k', 'i')
encode("lowest")
word split into characters: ('l', 'o', 'w', 'e', 's', 't', '</w>')
Iteration 1:
bigrams in the word: {('s', 't'), ('t', '</w>'), ('o', 'w'), ('w', 'e'), ('l', 'o'), ('e', 's')}
candidate for merging: ('e', 's')
word after merging: ('l', 'o', 'w', 'es', 't', '</w>')
Iteration 2:
bigrams in the word: {('t', '</w>'), ('o', 'w'), ('l', 'o'), ('es', 't'), ('w', 'es')}
candidate for merging: ('es', 't')
word after merging: ('l', 'o', 'w', 'est', '</w>')
Iteration 3:
bigrams in the word: {('l', 'o'), ('est', '</w>'), ('o', 'w'), ('w', 'est')}
candidate for merging: ('est', '</w>')
word after merging: ('l', 'o', 'w', 'est</w>')
Iteration 4:
bigrams in the word: {('w', 'est</w>'), ('l', 'o'), ('o', 'w')}
candidate for merging: ('l', 'o')
word after merging: ('lo', 'w', 'est</w>')
Iteration 5:
bigrams in the word: {('lo', 'w'), ('w', 'est</w>')}
candidate for merging: ('lo', 'w')
word after merging: ('low', 'est</w>')
Iteration 6:
bigrams in the word: {('low', 'est</w>')}
candidate for merging: ('low', 'est</w>')
Candidate not in BPE merges, algorithm stops.
('low', 'est')
encode("lowing")
word split into characters: ('l', 'o', 'w', 'i', 'n', 'g', '</w>')
Iteration 1:
bigrams in the word: {('n', 'g'), ('w', 'i'), ('i', 'n'), ('o', 'w'), ('l', 'o'), ('g', '</w>')}
candidate for merging: ('l', 'o')
word after merging: ('lo', 'w', 'i', 'n', 'g', '</w>')
Iteration 2:
bigrams in the word: {('n', 'g'), ('w', 'i'), ('i', 'n'), ('lo', 'w'), ('g', '</w>')}
candidate for merging: ('lo', 'w')
word after merging: ('low', 'i', 'n', 'g', '</w>')
Iteration 3:
bigrams in the word: {('n', 'g'), ('i', 'n'), ('g', '</w>'), ('low', 'i')}
candidate for merging: ('n', 'g')
Candidate not in BPE merges, algorithm stops.
('low', 'i', 'n', 'g')
import pandas as pd
import urllib3
import zipfile
import shutil
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
http = urllib3.PoolManager()
url ='http://www.manythings.org/anki/fra-eng.zip'
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename = os.path.join(path, filename)
with http.request('GET', url, preload_content=False) as r, open(zipfilename, 'wb') as out_file:
shutil.copyfileobj(r, out_file)
with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
zip_ref.extractall(path)
lines = pd.read_csv('fra.txt', names=['src', 'tar', 'lic'], sep='\t') # the file has three tab-separated fields: English, French, and a license/attribution string
len(lines)
178009
lines = lines.loc[:, 'src':'tar'] # drop the license column
lines = lines[0:60000] # keep only 60,000 pairs
lines.sample(10)
| src | tar |
|---|---|
| Unlock the door. | Déverrouille la porte. |
| I want to thank you. | Je veux te remercier. |
| They formed a circle. | Ils formèrent un cercle. |
| She became a nurse. | Elle devint infirmière. |
| No one was alive. | Personne n'était vivant. |
| What a pretty girl! | Quelle jolie fille ! |
| I can't stop writing. | Je ne peux pas m’arrêter d'écrire. |
| I love the way you kiss. | J'adore la façon que vous avez d'embrasser. |
| Did you question them? | Les avez-vous remis en question ? |
| You may be needed. | On pourrait avoir besoin de toi. |
lines.tar = lines.tar.apply(lambda x : '\t '+ x + ' \n') # \t: <sos> (start symbol), \n: <eos> (end symbol)
lines.sample(10)
| src | tar |
|---|---|
| Explain it to me. | \t Expliquez-le-moi. \n |
| Come to me. | \t Venez à moi. \n |
| We'll wait here. | \t Nous attendrons ici. \n |
| Hey, I want to help you. | \t Hé, je veux vous aider. \n |
| I rang the doorbell. | \t J'ai sonné à la porte. \n |
| Tom wore a straw hat. | \t Tom portait un chapeau de paille. \n |
| It was his best time. | \t Ça a été son meilleur temps. \n |
| Thanks for the tea. | \t Merci pour le thé. \n |
| We can live with that. | \t Nous pouvons vivre avec ça. \n |
| This isn't enough. | \t Ça ne suffit pas. \n |
# Build the character-level vocabularies
src_vocab = set()
for line in lines.src: # read one line at a time
    for char in line: # read one character at a time
        src_vocab.add(char)
tar_vocab=set()
for line in lines.tar:
for char in line:
tar_vocab.add(char)
src_vocab_size = len(src_vocab)+1
tar_vocab_size = len(tar_vocab)+1
print(src_vocab_size)
print(tar_vocab_size)
104 74
src_vocab = sorted(list(src_vocab))
tar_vocab = sorted(list(tar_vocab))
print(src_vocab[45:75])
print(tar_vocab[45:75])
src_to_index = dict([(word, i+1) for i, word in enumerate(src_vocab)])
tar_to_index = dict([(word, i+1) for i, word in enumerate(tar_vocab)])
print(src_to_index)
print(tar_to_index)
encoder_input = []
for line in lines.src:  # read the source data one sentence at a time
    temp_X = []
    for w in line:  # read each sentence one character at a time
        temp_X.append(src_to_index[w])  # convert the character to its integer index
    encoder_input.append(temp_X)
print(encoder_input[:5])
['V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']
['Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
{' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9, ',': 10, '-': 11, '.': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, '?': 24, 'A': 25, 'B': 26, 'C': 27, 'D': 28, 'E': 29, 'F': 30, 'G': 31, 'H': 32, 'I': 33, 'J': 34, 'K': 35, 'L': 36, 'M': 37, 'N': 38, 'O': 39, 'P': 40, 'Q': 41, 'R': 42, 'S': 43, 'T': 44, 'U': 45, 'V': 46, 'W': 47, 'X': 48, 'Y': 49, 'Z': 50, 'a': 51, 'b': 52, 'c': 53, 'd': 54, 'e': 55, 'f': 56, 'g': 57, 'h': 58, 'i': 59, 'j': 60, 'k': 61, 'l': 62, 'm': 63, 'n': 64, 'o': 65, 'p': 66, 'q': 67, 'r': 68, 's': 69, 't': 70, 'u': 71, 'v': 72, 'w': 73, 'x': 74, 'y': 75, 'z': 76, '\xa0': 77, '«': 78, '»': 79, 'À': 80, 'Ç': 81, 'É': 82, 'Ê': 83, 'Ô': 84, 'à': 85, 'â': 86, 'ç': 87, 'è': 88, 'é': 89, 'ê': 90, 'ë': 91, 'î': 92, 'ï': 93, 'ô': 94, 'ù': 95, 'û': 96, 'œ': 97, 'С': 98, '\u2009': 99, '\u200b': 100, '‘': 101, '’': 102, '\u202f': 103}
{'\t': 1, '\n': 2, ' ': 3, '#': 4, '&': 5, '(': 6, ')': 7, '-': 8, '.': 9, '0': 10, '1': 11, '2': 12, '3': 13, '4': 14, '5': 15, '6': 16, '7': 17, '8': 18, '9': 19, ':': 20, 'A': 21, 'B': 22, 'C': 23, 'D': 24, 'E': 25, 'F': 26, 'G': 27, 'H': 28, 'I': 29, 'J': 30, 'K': 31, 'L': 32, 'M': 33, 'N': 34, 'O': 35, 'P': 36, 'Q': 37, 'R': 38, 'S': 39, 'T': 40, 'U': 41, 'V': 42, 'W': 43, 'X': 44, 'Y': 45, 'Z': 46, '_': 47, 'a': 48, 'b': 49, 'c': 50, 'd': 51, 'e': 52, 'f': 53, 'g': 54, 'h': 55, 'i': 56, 'j': 57, 'k': 58, 'l': 59, 'm': 60, 'n': 61, 'o': 62, 'p': 63, 'q': 64, 'r': 65, 's': 66, 't': 67, 'u': 68, 'v': 69, 'w': 70, 'x': 71, 'y': 72, 'z': 73}
[[46, 51, 1, 2], [43, 51, 62, 71, 70, 1, 2], [43, 51, 62, 71, 70, 12], [27, 65, 71, 68, 69, 103, 2], [27, 65, 71, 68, 55, 76, 103, 2]]
decoder_input = []
for line in lines.tar:
    temp_X = []
    for w in line:
        temp_X.append(tar_to_index[w])
    decoder_input.append(temp_X)
print(decoder_input[:5])
# remove <sos> ('\t') to build the decoder targets
decoder_target = []
for line in lines.tar:
    t = 0
    temp_X = []
    for w in line:
        if t > 0:
            temp_X.append(tar_to_index[w])
        t = t + 1
    decoder_target.append(temp_X)
print(decoder_target[:5])
[[1, 3, 1, 3, 23, 23, 8, 22, 45, 3, 12, 9, 10, 3, 6, 26, 65, 48, 61, 50, 52, 7, 3, 21, 67, 67, 65, 56, 49, 68, 67, 56, 62, 61, 20, 3, 67, 48, 67, 62, 52, 49, 48, 9, 62, 65, 54, 3, 4, 12, 18, 17, 17, 12, 17, 12, 3, 6, 23, 33, 7, 3, 5, 3, 4, 11, 11, 15, 18, 12, 15, 10, 3, 6, 43, 56, 67, 67, 72, 51, 52, 69, 7, 3, 2, 3, 2], [1, 3, 1, 3, 23, 23, 8, 22, 45, 3, 12, 9, 10, 3, 6, 26, 65, 48, 61, 50, 52, 7, 3, 21, 67, 67, 65, 56, 49, 68, 67, 56, 62, 61, 20, 3, 67, 48, 67, 62, 52, 49, 48, 9, 62, 65, 54, 3, 4, 15, 13, 18, 11, 12, 13, 3, 6, 23, 33, 7, 3, 5, 3, 4, 15, 10, 19, 18, 11, 19, 3, 6, 21, 56, 57, 56, 7, 3, 2, 3, 2], [1, 3, 1, 3, 23, 23, 8, 22, 45, 3, 12, 9, 10, 3, 6, 26, 65, 48, 61, 50, 52, 7, 3, 21, 67, 67, 65, 56, 49, 68, 67, 56, 62, 61, 20, 3, 67, 48, 67, 62, 52, 49, 48, 9, 62, 65, 54, 3, 4, 15, 13, 18, 11, 12, 13, 3, 6, 23, 33, 7, 3, 5, 3, 4, 14, 13, 12, 10, 14, 16, 12, 3, 6, 54, 56, 59, 59, 68, 71, 7, 3, 2, 3, 2], [1, 3, 1, 3, 23, 23, 8, 22, 45, 3, 12, 9, 10, 3, 6, 26, 65, 48, 61, 50, 52, 7, 3, 21, 67, 67, 65, 56, 49, 68, 67, 56, 62, 61, 20, 3, 67, 48, 67, 62, 52, 49, 48, 9, 62, 65, 54, 3, 4, 19, 10, 16, 13, 12, 18, 3, 6, 63, 48, 63, 48, 49, 52, 48, 65, 7, 3, 5, 3, 4, 19, 10, 16, 13, 13, 11, 3, 6, 66, 48, 50, 65, 52, 51, 50, 52, 59, 67, 56, 50, 7, 3, 2, 3, 2], [1, 3, 1, 3, 23, 23, 8, 22, 45, 3, 12, 9, 10, 3, 6, 26, 65, 48, 61, 50, 52, 7, 3, 21, 67, 67, 65, 56, 49, 68, 67, 56, 62, 61, 20, 3, 67, 48, 67, 62, 52, 49, 48, 9, 62, 65, 54, 3, 4, 19, 10, 16, 13, 12, 18, 3, 6, 63, 48, 63, 48, 49, 52, 48, 65, 7, 3, 5, 3, 4, 19, 10, 16, 13, 13, 12, 3, 6, 66, 48, 50, 65, 52, 51, 50, 52, 59, 67, 56, 50, 7, 3, 2, 3, 2]] [[3, 1, 3, 23, 23, 8, 22, 45, 3, 12, 9, 10, 3, 6, 26, 65, 48, 61, 50, 52, 7, 3, 21, 67, 67, 65, 56, 49, 68, 67, 56, 62, 61, 20, 3, 67, 48, 67, 62, 52, 49, 48, 9, 62, 65, 54, 3, 4, 12, 18, 17, 17, 12, 17, 12, 3, 6, 23, 33, 7, 3, 5, 3, 4, 11, 11, 15, 18, 12, 15, 10, 3, 6, 43, 56, 67, 67, 72, 51, 52, 69, 7, 3, 2, 3, 2], [3, 1, 3, 23, 23, 8, 22, 45, 3, 12, 9, 10, 3, 6, 26, 65, 48, 61, 50, 52, 7, 3, 21, 67, 67, 65, 56, 49, 68, 67, 56, 62, 61, 20, 3, 67, 48, 67, 62, 52, 49, 48, 9, 62, 65, 54, 3, 4, 15, 13, 18, 11, 12, 13, 3, 6, 23, 33, 7, 3, 5, 3, 4, 15, 10, 19, 18, 11, 19, 3, 6, 21, 56, 57, 56, 7, 3, 2, 3, 2], [3, 1, 3, 23, 23, 8, 22, 45, 3, 12, 9, 10, 3, 6, 26, 65, 48, 61, 50, 52, 7, 3, 21, 67, 67, 65, 56, 49, 68, 67, 56, 62, 61, 20, 3, 67, 48, 67, 62, 52, 49, 48, 9, 62, 65, 54, 3, 4, 15, 13, 18, 11, 12, 13, 3, 6, 23, 33, 7, 3, 5, 3, 4, 14, 13, 12, 10, 14, 16, 12, 3, 6, 54, 56, 59, 59, 68, 71, 7, 3, 2, 3, 2], [3, 1, 3, 23, 23, 8, 22, 45, 3, 12, 9, 10, 3, 6, 26, 65, 48, 61, 50, 52, 7, 3, 21, 67, 67, 65, 56, 49, 68, 67, 56, 62, 61, 20, 3, 67, 48, 67, 62, 52, 49, 48, 9, 62, 65, 54, 3, 4, 19, 10, 16, 13, 12, 18, 3, 6, 63, 48, 63, 48, 49, 52, 48, 65, 7, 3, 5, 3, 4, 19, 10, 16, 13, 13, 11, 3, 6, 66, 48, 50, 65, 52, 51, 50, 52, 59, 67, 56, 50, 7, 3, 2, 3, 2], [3, 1, 3, 23, 23, 8, 22, 45, 3, 12, 9, 10, 3, 6, 26, 65, 48, 61, 50, 52, 7, 3, 21, 67, 67, 65, 56, 49, 68, 67, 56, 62, 61, 20, 3, 67, 48, 67, 62, 52, 49, 48, 9, 62, 65, 54, 3, 4, 19, 10, 16, 13, 12, 18, 3, 6, 63, 48, 63, 48, 49, 52, 48, 65, 7, 3, 5, 3, 4, 19, 10, 16, 13, 13, 12, 3, 6, 66, 48, 50, 65, 52, 51, 50, 52, 59, 67, 56, 50, 7, 3, 2, 3, 2]]
max_src_len = max([len(line) for line in lines.src])
max_tar_len = max([len(line) for line in lines.tar])
print(max_src_len)
print(max_tar_len)
encoder_input = pad_sequences(encoder_input, maxlen=max_src_len, padding='post')
decoder_input = pad_sequences(decoder_input, maxlen=max_tar_len, padding='post')
decoder_target = pad_sequences(decoder_target, maxlen=max_tar_len, padding='post')
encoder_input = to_categorical(encoder_input)
decoder_input = to_categorical(decoder_input)
decoder_target = to_categorical(decoder_target)
72 110
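For reference, after padding and one-hot encoding each dataset becomes a 3D tensor of shape (number of samples, sequence length, vocabulary size). A quick optional sanity check (exact numbers depend on the run above):
```python
# expected: (60000, max_src_len, src_vocab_size) and (60000, max_tar_len, tar_vocab_size)
print(encoder_input.shape)
print(decoder_input.shape)
print(decoder_target.shape)
```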
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
import numpy as np
encoder_inputs = Input(shape=(None, src_vocab_size))
encoder_lstm = LSTM(units=256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
# encoder_outputs is also returned, but it is not needed here, so it is discarded.
encoder_states = [state_h, state_c]
# Unlike a vanilla RNN, an LSTM has two states: the hidden state and the cell state.
decoder_inputs = Input(shape=(None, tar_vocab_size))
decoder_lstm = LSTM(units=256, return_sequences=True, return_state=True)
decoder_outputs, _, _= decoder_lstm(decoder_inputs, initial_state=encoder_states)
# The decoder's initial state is set to the encoder's hidden state and cell state.
decoder_softmax_layer = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_softmax_layer(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy")
model.fit(x=[encoder_input, decoder_input], y=decoder_target, batch_size=64, epochs=50, validation_split=0.2)
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)
# tensors that hold the states from the previous time step
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
# To predict the next word of the sentence, the previous time step's states are passed in as initial_state; this is implemented in decode_sequence() below.
decoder_states = [state_h, state_c]
# Unlike during training, the hidden state and cell state returned by the LSTM (state_h and state_c) are not discarded here.
decoder_outputs = decoder_softmax_layer(decoder_outputs)
decoder_model = Model(inputs=[decoder_inputs] + decoder_states_inputs, outputs=[decoder_outputs] + decoder_states)
index_to_src = dict((i, char) for char, i in src_to_index.items())
index_to_tar = dict((i, char) for char, i in tar_to_index.items())
def decode_sequence(input_seq):
    # get the encoder states from the input
    states_value = encoder_model.predict(input_seq)
    # create the one-hot vector corresponding to <SOS>
    target_seq = np.zeros((1, 1, tar_vocab_size))
    target_seq[0, 0, tar_to_index['\t']] = 1.
    stop_condition = False
    decoded_sentence = ""
    # loop until stop_condition becomes True
    while not stop_condition:
        # use the previous time step's states_value as the initial state of the current time step
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # convert the prediction into a character
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = index_to_tar[sampled_token_index]
        # append the character predicted at the current time step to the decoded sentence
        decoded_sentence += sampled_char
        # stop when <eos> is reached or the maximum length is exceeded.
        if (sampled_char == '\n' or
                len(decoded_sentence) > max_tar_len):
            stop_condition = True
        # store the current prediction to use as the next time step's input
        target_seq = np.zeros((1, 1, tar_vocab_size))
        target_seq[0, 0, sampled_token_index] = 1.
        # store the current states to use as the next time step's states
        states_value = [h, c]
    return decoded_sentence
for seq_index in [3, 50, 100, 300, 1001]:  # indices of the input sentences
    input_seq = encoder_input[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print(35 * "-")
    print('Input sentence:', lines.src[seq_index])
    print('Target sentence:', lines.tar[seq_index][1:len(lines.tar[seq_index])-1])  # printed without '\t' and '\n'
    print('Predicted translation:', decoded_sentence[:len(decoded_sentence)-1])  # printed without '\n'
Identity matrix: a square matrix whose main-diagonal entries are 1 and all other entries are 0.
Diagonal matrix: a matrix whose entries outside the main diagonal are all 0.
The main-diagonal entries of the diagonal matrix produced by SVD are called the singular values of the matrix A, and they are sorted in descending order.
e.g., sorted in the order 12.4 -> 9.5 -> 1.3.
The SVD above is called the full SVD; LSA uses a truncated SVD, obtained by deleting some of the vectors from the three matrices of the full SVD.
Only the top t values among the diagonal entries of the diagonal matrix are kept.
import numpy as np
A=np.array([[0,0,0,1,0,1,1,0,0],[0,0,0,1,1,0,1,0,0],[0,1,1,0,2,0,0,0,0],[1,0,0,0,0,0,0,1,1]])
np.shape(A)
(4, 9)
# full SVD
U, s, VT = np.linalg.svd(A, full_matrices = True)
print(U.round(2)) # print to two decimal places
np.shape(U)
[[-0.24 0.75 0. -0.62] [-0.51 0.44 -0. 0.74] [-0.83 -0.49 -0. -0.27] [-0. -0. 1. 0. ]]
(4, 4)
print(s.round(2))
np.shape(s)
[2.69 2.05 1.73 0.77]
(4,)
# diagonal matrix S
S = np.zeros((4, 9)) # create a 4 x 9 zero matrix, the size of the diagonal matrix
S[:4, :4] = np.diag(s) # put the singular values on the diagonal
print(S.round(2))
np.shape(S)
[[2.69 0. 0. 0. 0. 0. 0. 0. 0. ] [0. 2.05 0. 0. 0. 0. 0. 0. 0. ] [0. 0. 1.73 0. 0. 0. 0. 0. 0. ] [0. 0. 0. 0.77 0. 0. 0. 0. 0. ]]
(4, 9)
# orthogonal matrix VT (the transpose of V)
print(VT.round(2))
np.shape(VT)
[[-0. -0.31 -0.31 -0.28 -0.8 -0.09 -0.28 -0. -0. ] [ 0. -0.24 -0.24 0.58 -0.26 0.37 0.58 -0. -0. ] [ 0.58 -0. 0. 0. -0. 0. -0. 0.58 0.58] [ 0. -0.35 -0.35 0.16 0.25 -0.8 0.16 -0. -0. ] [-0. -0.78 -0.01 -0.2 0.4 0.4 -0.2 0. 0. ] [-0.29 0.31 -0.78 -0.24 0.23 0.23 0.01 0.14 0.14] [-0.29 -0.1 0.26 -0.59 -0.08 -0.08 0.66 0.14 0.14] [-0.5 -0.06 0.15 0.24 -0.05 -0.05 -0.19 0.75 -0.25] [-0.5 -0.06 0.15 0.24 -0.05 -0.05 -0.19 -0.25 0.75]]
(9, 9)
# allclose(): returns True if the two matrices are (element-wise) equal
np.allclose(A, np.dot(np.dot(U,S), VT).round(2))
True
# truncated SVD
S=S[:2,:2]
print(S.round(2))
U=U[:,:2]
print(U.round(2))
VT=VT[:2,:]
print(VT.round(2))
[[2.69 0. ] [0. 2.05]] [[-0.24 0.75] [-0.51 0.44] [-0.83 -0.49] [-0. -0. ]] [[-0. -0.31 -0.31 -0.28 -0.8 -0.09 -0.28 -0. -0. ] [ 0. -0.24 -0.24 0.58 -0.26 0.37 0.58 -0. -0. ]]
A_prime=np.dot(np.dot(U,S), VT)
print(A) # original matrix
print(A_prime.round(2)) # reconstruction from the truncated SVD
[[0 0 0 1 0 1 1 0 0] [0 0 0 1 1 0 1 0 0] [0 1 1 0 2 0 0 0 0] [1 0 0 0 0 0 0 1 1]] [[ 0. -0.17 -0.17 1.08 0.12 0.62 1.08 -0. -0. ] [ 0. 0.2 0.2 0.91 0.86 0.45 0.91 0. 0. ] [ 0. 0.93 0.93 0.03 2.05 -0.17 0.03 0. 0. ] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. ]]
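One way to quantify how much the rank-2 approximation loses (not part of the run above, just a small optional check) is the Frobenius norm of the difference, relative to the norm of A:
```python
# relative reconstruction error of the truncated SVD, measured with the Frobenius norm
error = np.linalg.norm(A - A_prime)
print(error, (error / np.linalg.norm(A)).round(3))
```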
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
len(documents)
Downloading 20news dataset. This may take a few minutes. Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
11314
documents[1]
"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism? No, you need a little leap of faith, Jimmy. Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim. And I'm sorry that you have these feelings of\ndenial about the faith you need to get by. Oh well, just pretend that it will\nall end happily ever after anyway. Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim. Don't forget your Flintstone's Chewables! :) \n--\nBake Timmons, III"
# the 20 categories in the newsgroups data
print(dataset.target_names)
print(len(dataset.target_names))
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'] 20
news_df = pd.DataFrame({'document':documents})
# remove special characters
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")
# remove words whose length is 3 or less (drop short words)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# lowercase all words
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())
news_df['clean_doc'][1]
'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english') # load the stopword list from NLTK
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split()) # tokenization
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
# remove stopwords
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
print(tokenized_doc[1])
['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']
# detokenization (reverses the tokenization step)
detokenized_doc = []
for i in range(len(news_df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
news_df['clean_doc'] = detokenized_doc
news_df['clean_doc'][1]
'yeah expect people read actually accept hard atheism need little leap faith jimmy logic runs steam sorry pity sorry feelings denial faith need well pretend happily ever anyway maybe start newsgroup atheist hard bummin much forget flintstone chewables bake timmons'
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english',
                             max_features=1000,  # keep the top 1,000 words
                             max_df=0.5,
                             smooth_idf=True)
X = vectorizer.fit_transform(news_df['clean_doc'])
X.shape # check the size of the TF-IDF matrix
(11314, 1000)
from sklearn.decomposition import TruncatedSVD
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)
svd_model.fit(X)
len(svd_model.components_)
20
np.shape(svd_model.components_) # svd_model.components_ : VT
(20, 1000)
terms = vectorizer.get_feature_names() # vocabulary; the 1,000 retained words
# from the (20, 1000) matrix, find the 5 largest values in each row and print them as words.
def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(5)) for i in topic.argsort()[:-n - 1:-1]])
get_topics(svd_model.components_,terms)
Topic 1: [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128)]
Topic 2: [('thanks', 0.32888), ('windows', 0.29088), ('card', 0.18069), ('drive', 0.17455), ('mail', 0.15111)]
Topic 3: [('game', 0.37064), ('team', 0.32443), ('year', 0.28154), ('games', 0.2537), ('season', 0.18419)]
Topic 4: [('drive', 0.53324), ('scsi', 0.20165), ('hard', 0.15628), ('disk', 0.15578), ('card', 0.13994)]
Topic 5: [('windows', 0.40399), ('file', 0.25436), ('window', 0.18044), ('files', 0.16078), ('program', 0.13894)]
Topic 6: [('chip', 0.16114), ('government', 0.16009), ('mail', 0.15625), ('space', 0.1507), ('information', 0.13562)]
Topic 7: [('like', 0.67086), ('bike', 0.14236), ('chip', 0.11169), ('know', 0.11139), ('sounds', 0.10371)]
Topic 8: [('card', 0.46633), ('video', 0.22137), ('sale', 0.21266), ('monitor', 0.15463), ('offer', 0.14643)]
Topic 9: [('know', 0.46047), ('card', 0.33605), ('chip', 0.17558), ('government', 0.1522), ('video', 0.14356)]
Topic 10: [('good', 0.42756), ('know', 0.23039), ('time', 0.1882), ('bike', 0.11406), ('jesus', 0.09027)]
Topic 11: [('think', 0.78469), ('chip', 0.10899), ('good', 0.10635), ('thanks', 0.09123), ('clipper', 0.07946)]
Topic 12: [('thanks', 0.36824), ('good', 0.22729), ('right', 0.21559), ('bike', 0.21037), ('problem', 0.20894)]
Topic 13: [('good', 0.36212), ('people', 0.33985), ('windows', 0.28385), ('know', 0.26232), ('file', 0.18422)]
Topic 14: [('space', 0.39946), ('think', 0.23258), ('know', 0.18074), ('nasa', 0.15174), ('problem', 0.12957)]
Topic 15: [('space', 0.31613), ('good', 0.3094), ('card', 0.22603), ('people', 0.17476), ('time', 0.14496)]
Topic 16: [('people', 0.48156), ('problem', 0.19961), ('window', 0.15281), ('time', 0.14664), ('game', 0.12871)]
Topic 17: [('time', 0.34465), ('bike', 0.27303), ('right', 0.25557), ('windows', 0.1997), ('file', 0.19118)]
Topic 18: [('time', 0.5973), ('problem', 0.15504), ('file', 0.14956), ('think', 0.12847), ('israel', 0.10903)]
Topic 19: [('file', 0.44163), ('need', 0.26633), ('card', 0.18388), ('files', 0.17453), ('right', 0.15448)]
Topic 20: [('problem', 0.33006), ('file', 0.27651), ('thanks', 0.23578), ('used', 0.19206), ('space', 0.13185)]
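The 20 rows above are the topic-word directions (VT). The corresponding document-topic representation can be obtained with `transform()`, which maps each of the 11,314 documents to a 20-dimensional vector; a small optional check:
```python
doc_topic = svd_model.transform(X)  # document-topic matrix, shape (11314, 20)
print(doc_topic.shape)
print(doc_topic[1].round(3))  # topic weights of the second document
```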
Document 1: I eat apples and bananas
Document 2: We like our cute puppy
Document 3: My adorable, cute puppy eats bananas
LDA estimates the topic distribution of each document and the word distribution within each topic.
<Topic distribution of each document>
Document 1: topic A 100%
Document 2: topic B 100%
Document 3: topic B 60%, topic A 40%
<Word distribution of each topic>
Topic A: apple 20%, banana 40%, eat 40%, cute 0%, puppy 0%, adorable 0%, like 0%
Topic B: apple 0%, banana 0%, eat 0%, cute 33%, puppy 33%, adorable 16%, like 16%
The topics are not given names, but they can be read as one topic about fruit and one topic about puppies.
LDA takes a BoW representation (a DTM or TF-IDF matrix) as input, which means it ignores word order.
LDA assumes that the author of a document first thought, "To write this document I will include these topics, and for those topics I will use these words," and then produced the document through the following process.
Under this generative assumption, LDA extracts topics by reverse engineering, tracing that process backwards.
1) The user tells the algorithm the number of topics k. 2) Every word is assigned to one of the k topics. 3) For every word in every document, the following is repeated. 3-1) Assume the current word w is assigned to the wrong topic while every other word is assigned to the correct topic; the word's topic is then re-chosen using the two quantities below (see the sketch after this comparison).
p(topic t | document d): the proportion of the words in document d that are assigned to topic t
p(word w | topic t): among the documents containing the word w, the proportion in which topic t is assigned
LSA: reduce the dimensionality of the DTM and group words that are close together in the reduced space into topics.
LDA: estimate, as a joint probability, how likely a word is to belong to a particular topic and how likely a document is to contain a particular topic, and extract topics from that.
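To make step 3-1 concrete, the word's topic is resampled in proportion to the product of the two quantities above. A schematic sketch of that reassignment step (hypothetical count arrays and smoothing hyperparameters `alpha`, `beta`; this is not how gensim is implemented internally):
```python
import numpy as np

def reassign_topic(doc_topic_count, topic_word_count, topic_count, d, w, alpha=0.1, beta=0.01):
    # doc_topic_count[d, t] : number of words in document d assigned to topic t
    # topic_word_count[t, w]: number of times word w is assigned to topic t (across all documents)
    # topic_count[t]        : total number of words assigned to topic t
    # (counts are assumed to already exclude the word currently being resampled)
    p_topic_given_doc = doc_topic_count[d] + alpha                        # ~ p(topic t | document d)
    p_word_given_topic = (topic_word_count[:, w] + beta) / \
                         (topic_count + beta * topic_word_count.shape[1]) # ~ p(word w | topic t)
    weights = p_topic_given_doc * p_word_given_topic
    return np.random.choice(len(weights), p=weights / weights.sum())     # new topic for word w in document d
```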
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
len(documents)
news_df = pd.DataFrame({'document':documents})
# remove special characters
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")
# remove words whose length is 3 or less (drop short words)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# lowercase all words
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english') # load the stopword list from NLTK
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split()) # tokenization
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
# remove stopwords
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Package stopwords is already up-to-date!
tokenized_doc[:5]
0 [well, sure, story, seem, biased, disagree, st... 1 [yeah, expect, people, read, actually, accept,... 2 [although, realize, principle, strongest, poin... 3 [notwithstanding, legitimate, fuss, proposal, ... 4 [well, change, scoring, playoff, pool, unfortu... Name: clean_doc, dtype: object
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[1]) # print the second news item from the result; the first document has index 0
# (index of the integer-encoded word, frequency of that word in the document)
[(52, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1)]
print(dictionary[66])
faith
len(dictionary)
64281
import gensim
NUM_TOPICS = 20 # 20 topics, i.e. k=20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)
(0, '0.005*"coli" + 0.005*"weaver" + 0.005*"francis" + 0.004*"mark"') (1, '0.012*"drive" + 0.010*"problem" + 0.009*"would" + 0.009*"like"') (2, '0.013*"price" + 0.013*"sale" + 0.010*"offer" + 0.010*"shipping"') (3, '0.010*"nrhj" + 0.007*"wwiz" + 0.006*"bxom" + 0.006*"gizw"') (4, '0.010*"president" + 0.006*"american" + 0.006*"year" + 0.005*"think"') (5, '0.011*"available" + 0.010*"mail" + 0.009*"file" + 0.009*"information"') (6, '0.012*"medical" + 0.010*"health" + 0.009*"disease" + 0.008*"pain"') (7, '0.023*"game" + 0.022*"team" + 0.018*"games" + 0.016*"play"') (8, '0.010*"would" + 0.010*"people" + 0.006*"think" + 0.005*"many"') (9, '0.015*"char" + 0.015*"remark" + 0.009*"islanders" + 0.008*"kent"') (10, '0.010*"food" + 0.006*"smokeless" + 0.005*"billion" + 0.004*"chinese"') (11, '0.008*"printer" + 0.007*"print" + 0.005*"borland" + 0.004*"engine"') (12, '0.014*"would" + 0.011*"like" + 0.009*"know" + 0.008*"time"') (13, '0.014*"bike" + 0.010*"printf" + 0.008*"cars" + 0.007*"engine"') (14, '0.020*"file" + 0.016*"window" + 0.015*"entry" + 0.014*"output"') (15, '0.008*"ground" + 0.007*"wire" + 0.006*"power" + 0.005*"current"') (16, '0.011*"armenian" + 0.010*"israel" + 0.010*"armenians" + 0.009*"jews"') (17, '0.012*"chip" + 0.011*"keys" + 0.009*"data" + 0.009*"number"') (18, '0.036*"space" + 0.015*"nasa" + 0.009*"center" + 0.008*"launch"') (19, '0.012*"government" + 0.008*"public" + 0.006*"would" + 0.006*"security"')
# print 10 words for each topic
print(ldamodel.print_topics())
[(0, '0.005*"coli" + 0.005*"weaver" + 0.005*"francis" + 0.004*"mark" + 0.004*"maine" + 0.004*"andrew" + 0.004*"cleveland" + 0.004*"acid" + 0.004*"steve" + 0.004*"finals"'), (1, '0.012*"drive" + 0.010*"problem" + 0.009*"would" + 0.009*"like" + 0.009*"system" + 0.009*"windows" + 0.009*"card" + 0.008*"know" + 0.007*"disk" + 0.007*"scsi"'), (2, '0.013*"price" + 0.013*"sale" + 0.010*"offer" + 0.010*"shipping" + 0.009*"condition" + 0.008*"sell" + 0.008*"asking" + 0.007*"best" + 0.006*"excellent" + 0.005*"used"'), (3, '0.010*"nrhj" + 0.007*"wwiz" + 0.006*"bxom" + 0.006*"gizw" + 0.005*"tbxn" + 0.005*"bhjn" + 0.005*"bxlt" + 0.004*"wmbxn" + 0.004*"nriz" + 0.004*"pnei"'), (4, '0.010*"president" + 0.006*"american" + 0.006*"year" + 0.005*"think" + 0.005*"going" + 0.005*"people" + 0.005*"april" + 0.005*"money" + 0.005*"work" + 0.005*"would"'), (5, '0.011*"available" + 0.010*"mail" + 0.009*"file" + 0.009*"information" + 0.008*"also" + 0.008*"files" + 0.008*"software" + 0.008*"please" + 0.007*"version" + 0.007*"send"'), (6, '0.012*"medical" + 0.010*"health" + 0.009*"disease" + 0.008*"pain" + 0.007*"patients" + 0.005*"doctor" + 0.005*"study" + 0.005*"cancer" + 0.004*"cause" + 0.004*"diseases"'), (7, '0.023*"game" + 0.022*"team" + 0.018*"games" + 0.016*"play" + 0.014*"season" + 0.012*"players" + 0.011*"hockey" + 0.010*"period" + 0.010*"league" + 0.008*"teams"'), (8, '0.010*"would" + 0.010*"people" + 0.006*"think" + 0.005*"many" + 0.005*"believe" + 0.005*"jesus" + 0.005*"even" + 0.004*"know" + 0.004*"also" + 0.004*"like"'), (9, '0.015*"char" + 0.015*"remark" + 0.009*"islanders" + 0.008*"kent" + 0.007*"mask" + 0.006*"argv" + 0.006*"hawks" + 0.006*"graphic" + 0.005*"devils" + 0.005*"award"'), (10, '0.010*"food" + 0.006*"smokeless" + 0.005*"billion" + 0.004*"chinese" + 0.004*"dept" + 0.004*"term" + 0.003*"contacts" + 0.003*"high" + 0.003*"outbreak" + 0.003*"option"'), (11, '0.008*"printer" + 0.007*"print" + 0.005*"borland" + 0.004*"engine" + 0.004*"year" + 0.004*"liar" + 0.004*"laser" + 0.004*"yankees" + 0.003*"exhaust" + 0.003*"printing"'), (12, '0.014*"would" + 0.011*"like" + 0.009*"know" + 0.008*"time" + 0.008*"think" + 0.008*"good" + 0.007*"people" + 0.007*"could" + 0.007*"well" + 0.006*"back"'), (13, '0.014*"bike" + 0.010*"printf" + 0.008*"cars" + 0.007*"engine" + 0.006*"road" + 0.006*"miles" + 0.006*"ride" + 0.005*"front" + 0.005*"riding" + 0.004*"motorcycle"'), (14, '0.020*"file" + 0.016*"window" + 0.015*"entry" + 0.014*"output" + 0.014*"program" + 0.009*"motif" + 0.009*"widget" + 0.007*"build" + 0.007*"line" + 0.007*"server"'), (15, '0.008*"ground" + 0.007*"wire" + 0.006*"power" + 0.005*"current" + 0.004*"circuit" + 0.004*"high" + 0.004*"wiring" + 0.004*"theory" + 0.004*"output" + 0.003*"neutral"'), (16, '0.011*"armenian" + 0.010*"israel" + 0.010*"armenians" + 0.009*"jews" + 0.008*"turkish" + 0.008*"people" + 0.006*"israeli" + 0.005*"said" + 0.005*"turkey" + 0.005*"killed"'), (17, '0.012*"chip" + 0.011*"keys" + 0.009*"data" + 0.009*"number" + 0.009*"bits" + 0.007*"ripem" + 0.007*"algorithm" + 0.006*"used" + 0.006*"serial" + 0.005*"using"'), (18, '0.036*"space" + 0.015*"nasa" + 0.009*"center" + 0.008*"launch" + 0.007*"satellite" + 0.007*"earth" + 0.006*"research" + 0.006*"shuttle" + 0.006*"data" + 0.006*"orbit"'), (19, '0.012*"government" + 0.008*"public" + 0.006*"would" + 0.006*"security" + 0.006*"encryption" + 0.006*"state" + 0.005*"people" + 0.005*"control" + 0.005*"guns" + 0.005*"right"')]
!pip install pyLDAvis
Collecting pyLDAvis
Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
|████████████████████████████████| 1.6MB 4.2MB/s
Requirement already satisfied: wheel>=0.23.0 in /usr/local/lib/python3.6/dist-packages (from pyLDAvis) (0.34.2)
Requirement already satisfied: numpy>=1.9.2 in /usr/local/lib/python3.6/dist-packages (from pyLDAvis) (1.18.5)
Requirement already satisfied: scipy>=0.18.0 in /usr/local/lib/python3.6/dist-packages (from pyLDAvis) (1.4.1)
Requirement already satisfied: pandas>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from pyLDAvis) (1.0.5)
Requirement already satisfied: joblib>=0.8.4 in /usr/local/lib/python3.6/dist-packages (from pyLDAvis) (0.16.0)
Requirement already satisfied: jinja2>=2.7.2 in /usr/local/lib/python3.6/dist-packages (from pyLDAvis) (2.11.2)
Requirement already satisfied: numexpr in /usr/local/lib/python3.6/dist-packages (from pyLDAvis) (2.7.1)
Requirement already satisfied: pytest in /usr/local/lib/python3.6/dist-packages (from pyLDAvis) (3.6.4)
Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from pyLDAvis) (0.16.0)
Collecting funcy
Downloading https://files.pythonhosted.org/packages/ce/4b/6ffa76544e46614123de31574ad95758c421aae391a1764921b8a81e1eae/funcy-1.14.tar.gz (548kB)
|████████████████████████████████| 552kB 21.6MB/s
Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.17.0->pyLDAvis) (2.8.1)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.17.0->pyLDAvis) (2018.9)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2>=2.7.2->pyLDAvis) (1.1.1)
Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyLDAvis) (19.3.0)
Requirement already satisfied: more-itertools>=4.0.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyLDAvis) (8.4.0)
Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from pytest->pyLDAvis) (49.2.0)
Requirement already satisfied: atomicwrites>=1.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyLDAvis) (1.4.0)
Requirement already satisfied: py>=1.5.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyLDAvis) (1.9.0)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from pytest->pyLDAvis) (1.15.0)
Requirement already satisfied: pluggy<0.8,>=0.5 in /usr/local/lib/python3.6/dist-packages (from pytest->pyLDAvis) (0.7.1)
Building wheels for collected packages: pyLDAvis, funcy
Building wheel for pyLDAvis (setup.py) ... done
Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97711 sha256=52f0dea0b40710aa11fba08c35c1d2398832d2064c539c40cbb60aee5e8280f6
Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
Building wheel for funcy (setup.py) ... done
Created wheel for funcy: filename=funcy-1.14-py2.py3-none-any.whl size=32042 sha256=6a38dff8d5f3d66dc80e430c3bd50ba1f5a266f23c8f6c32da0f6f238a91669e
Stored in directory: /root/.cache/pip/wheels/20/5a/d8/1d875df03deae6f178dfdf70238cca33f948ef8a6f5209f2eb
Successfully built pyLDAvis funcy
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.14 pyLDAvis-2.1.2
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)
0 번째 문서의 topic 비율은 [(4, 0.34923318), (8, 0.3780517), (16, 0.2590055)] 1 번째 문서의 topic 비율은 [(1, 0.38693827), (8, 0.49144486), (12, 0.10088517)] 2 번째 문서의 topic 비율은 [(0, 0.04607978), (8, 0.25842667), (12, 0.3245097), (16, 0.30464748), (17, 0.054239646)] 3 번째 문서의 topic 비율은 [(1, 0.12215593), (8, 0.12693422), (11, 0.16771415), (12, 0.23100159), (19, 0.34100008)] 4 번째 문서의 topic 비율은 [(1, 0.40955263), (7, 0.24102388), (12, 0.24536817), (14, 0.07442568)]
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()
    # take out the document number and that document's topic proportions, one document at a time.
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # for each document, sort its topics so that the highest-weight topic comes first.
        # e.g. document 0 before sorting : (topic 2, 48.5%), (topic 8, 25%), (topic 10, 5%), (topic 12, 21.5%)
        # e.g. document 0 after sorting  : (topic 2, 48.5%), (topic 8, 25%), (topic 12, 21.5%), (topic 10, 5%)
        # i.e. sorted in the order 48 > 25 > 21 > 5.
        # then do the following for every document
        for j, (topic_num, prop_topic) in enumerate(doc):  # split out the topic number and its proportion.
            if j == 0:  # after sorting, the first entry is the topic with the highest proportion
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # store the dominant topic, its proportion, and the full topic distribution.
            else:
                break
    return(topic_table)
topictable = make_topictable_per_doc(ldamodel, corpus)
topictable = topictable.reset_index() # add an index column to use as the document-number column
topictable.columns = ['Document no.', 'Dominant topic', 'Dominant topic weight', 'Topic distribution']
topictable[:10]
| Document no. | Dominant topic | Dominant topic weight | Topic distribution | |
|---|---|---|---|---|
| 0 | 0 | 8.0 | 0.3780 | [(4, 0.34922978), (8, 0.37804338), (16, 0.2590... |
| 1 | 1 | 8.0 | 0.4914 | [(1, 0.3868836), (8, 0.4913948), (12, 0.100989... |
| 2 | 2 | 12.0 | 0.3245 | [(0, 0.046079557), (8, 0.25844014), (12, 0.324... |
| 3 | 3 | 19.0 | 0.3410 | [(1, 0.12217032), (8, 0.12684505), (11, 0.1677... |
| 4 | 4 | 1.0 | 0.4095 | [(1, 0.40951708), (7, 0.24102204), (12, 0.2454... |
| 5 | 5 | 1.0 | 0.3399 | [(1, 0.33988795), (8, 0.30458233), (10, 0.0438... |
| 6 | 6 | 11.0 | 0.7687 | [(1, 0.13989833), (11, 0.76868165), (12, 0.052... |
| 7 | 7 | 4.0 | 0.3474 | [(4, 0.34736708), (8, 0.30865473), (12, 0.2449... |
| 8 | 8 | 10.0 | 0.6318 | [(8, 0.1317907), (10, 0.63179904), (12, 0.2134... |
| 9 | 9 | 12.0 | 0.3579 | [(1, 0.2951199), (2, 0.043202877), (6, 0.10784... |
import pandas as pd
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv", filename="abcnews-date-text.csv")
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)
print(len(data))
1082168
print(data.head(5))
publish_date headline_text 0 20030219 aba decides against community broadcasting lic... 1 20030219 act fire witnesses must be aware of defamation 2 20030219 a g calls for infrastructure protection summit 3 20030219 air nz staff in aust strike for pay rise 4 20030219 air nz strike to affect australian travellers
text = data[['headline_text']]
text.head(5)
| headline_text | |
|---|---|
| 0 | aba decides against community broadcasting lic... |
| 1 | act fire witnesses must be aware of defamation |
| 2 | a g calls for infrastructure protection summit |
| 3 | air nz staff in aust strike for pay rise |
| 4 | air nz strike to affect australian travellers |
import nltk
nltk.download('punkt')
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip.
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy This is separate from the ipykernel package so we can avoid doing imports until
print(text.head(5))
headline_text 0 [aba, decides, against, community, broadcastin... 1 [act, fire, witnesses, must, be, aware, of, de... 2 [a, g, calls, for, infrastructure, protection,... 3 [air, nz, staff, in, aust, strike, for, pay, r... 4 [air, nz, strike, to, affect, australian, trav...
# remove stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop)])
print(text.head(5))
headline_text 0 [aba, decides, community, broadcasting, licence] 1 [act, fire, witnesses, must, aware, defamation] 2 [g, calls, infrastructure, protection, summit] 3 [air, nz, staff, aust, strike, pay, rise] 4 [air, nz, strike, affect, australian, travellers]
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy This is separate from the ipykernel package so we can avoid doing imports until
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to /root/nltk_data... [nltk_data] Unzipping corpora/wordnet.zip.
True
# lemmatization
from nltk.stem import WordNetLemmatizer
text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
print(text.head(5))
headline_text 0 [aba, decide, community, broadcast, licence] 1 [act, fire, witness, must, aware, defamation] 2 [g, call, infrastructure, protection, summit] 3 [air, nz, staff, aust, strike, pay, rise] 4 [air, nz, strike, affect, australian, travellers]
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# remove words whose length is 3 or less
tokenized_doc = text['headline_text'].apply(lambda x: [word for word in x if len(word) > 3])
print(tokenized_doc[:5])
0 [decide, community, broadcast, licence] 1 [fire, witness, must, aware, defamation] 2 [call, infrastructure, protection, summit] 3 [staff, aust, strike, rise] 4 [strike, affect, australian, travellers] Name: headline_text, dtype: object
# detokenization (reverses the tokenization step)
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
text['headline_text'] = detokenized_doc # store the result back into text['headline_text']
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy import sys
text['headline_text'][:5]
0 decide community broadcast licence 1 fire witness must aware defamation 2 call infrastructure protection summit 3 staff aust strike rise 4 strike affect australian travellers Name: headline_text, dtype: object
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english',
                             max_features=1000)  # keep the top 1,000 words
X = vectorizer.fit_transform(text['headline_text'])
X.shape # check the size of the TF-IDF matrix
(1082168, 1000)
from sklearn.decomposition import LatentDirichletAllocation
lda_model=LatentDirichletAllocation(n_components=10,learning_method='online',random_state=777,max_iter=1)
lda_top=lda_model.fit_transform(X)
print(lda_model.components_)
print(lda_model.components_.shape)
[[1.00001533e-01 1.00001269e-01 1.00004179e-01 ... 1.00006124e-01 1.00003111e-01 1.00003064e-01] [1.00001199e-01 1.13513398e+03 3.50170830e+03 ... 1.00009349e-01 1.00001896e-01 1.00002937e-01] [1.00001811e-01 1.00001151e-01 1.00003566e-01 ... 1.00002693e-01 1.00002061e-01 7.53381835e+02] ... [1.00001065e-01 1.00001689e-01 1.00003278e-01 ... 1.00006721e-01 1.00004902e-01 1.00004759e-01] [1.00002401e-01 1.00000732e-01 1.00002989e-01 ... 1.00003517e-01 1.00001428e-01 1.00005266e-01] [1.00003427e-01 1.00002313e-01 1.00007340e-01 ... 1.00003732e-01 1.00001207e-01 1.00005153e-01]] (10, 1000)
terms = vectorizer.get_feature_names() # vocabulary; the 1,000 retained words
def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])
get_topics(lda_model.components_,terms)
Topic 1: [('government', 8725.19), ('sydney', 8393.29), ('queensland', 7720.12), ('change', 5874.27), ('home', 5674.38)]
Topic 2: [('australia', 13691.08), ('australian', 11088.95), ('melbourne', 7528.43), ('world', 6707.7), ('south', 6677.03)]
Topic 3: [('death', 5935.06), ('interview', 5924.98), ('kill', 5851.6), ('jail', 4632.85), ('life', 4275.27)]
Topic 4: [('house', 6113.49), ('2016', 5488.19), ('state', 4923.41), ('brisbane', 4857.21), ('tasmania', 4610.97)]
Topic 5: [('court', 7542.74), ('attack', 6959.64), ('open', 5663.0), ('face', 5193.63), ('warn', 5115.01)]
Topic 6: [('market', 5545.86), ('rural', 5502.89), ('plan', 4828.71), ('indigenous', 4223.4), ('power', 3968.26)]
Topic 7: [('charge', 8428.8), ('election', 7561.63), ('adelaide', 6758.36), ('make', 5658.99), ('test', 5062.69)]
Topic 8: [('police', 12092.44), ('crash', 5281.14), ('drug', 4290.87), ('beat', 3257.58), ('rise', 2934.92)]
Topic 9: [('fund', 4693.03), ('labor', 4047.69), ('national', 4038.68), ('council', 4006.62), ('claim', 3604.75)]
Topic 10: [('trump', 11966.41), ('perth', 6456.53), ('report', 5611.33), ('school', 5465.06), ('woman', 5456.76)]
An RNN sends the value produced by the hidden-layer node's activation function toward the output layer, while also feeding it back as input to that node's next computation.
x is the input vector and y is the output vector. The node in the hidden layer that produces a result through the activation function is called a cell. Because it acts as a kind of memory that tries to remember previous values, it is also called a memory cell or an RNN cell.
Hidden state h(t) of an RNN: the hyperbolic tangent (tanh) is usually used as the activation function; ReLU is sometimes used as well.
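As a rough illustration of that cell (a minimal numpy sketch with made-up sizes, not the Keras implementation), one time step computes h_t = tanh(Wx·x_t + Wh·h_{t-1} + b):
```python
import numpy as np

input_dim, hidden_size = 10, 3
Wx = np.random.randn(hidden_size, input_dim)    # input-to-hidden weights
Wh = np.random.randn(hidden_size, hidden_size)  # hidden-to-hidden weights
b = np.random.randn(hidden_size)                # bias

h_prev = np.zeros(hidden_size)                  # previous hidden state (the cell's memory)
x_t = np.random.randn(input_dim)                # input at the current time step
h_t = np.tanh(Wx @ x_t + Wh @ h_prev + b)       # current hidden state, also sent toward the output layer
print(h_t)
```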
# Code that adds an RNN layer.
model.add(SimpleRNN(hidden_size)) # simplest form
# when using additional arguments
model.add(SimpleRNN(hidden_size, input_shape=(timesteps, input_dim)))
# alternative notation
model.add(SimpleRNN(hidden_size, input_length=M, input_dim=N))
# where M and N are integers
from keras.models import Sequential
from keras.layers import SimpleRNN
model = Sequential()
model.add(SimpleRNN(3, input_shape=(2,10)))
# model.add(SimpleRNN(3, input_length=2, input_dim=10))와 동일함.
model.summary()
Model: "sequential_4" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= simple_rnn (SimpleRNN) (None, 3) 42 ================================================================= Total params: 42 Trainable params: 42 Non-trainable params: 0 _________________________________________________________________
model = Sequential()
model.add(SimpleRNN(3, batch_input_shape=(8,2,10)))
model.summary()
Model: "sequential_5" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= simple_rnn_1 (SimpleRNN) (8, 3) 42 ================================================================= Total params: 42 Trainable params: 42 Non-trainable params: 0 _________________________________________________________________
# Code that adds an LSTM hidden layer.
model.add(LSTM(hidden_size, input_shape=(timesteps, input_dim)))
GRU is known to train faster than LSTM, but the two are generally reported to perform about the same.
With less data, GRU, which has fewer parameters, is said to be the better fit; with more data, LSTM.
# Code that adds a GRU hidden layer.
model.add(GRU(hidden_size, input_shape=(timesteps, input_dim)))
Unlike an n-gram model, a language model built with an RNN does not have to fix the length of its input.
ex) "what will the fat cat sit on"
During prediction, an RNNLM basically feeds the output of the previous time step back in as the input of the current time step (y1 -> x2).
Cross-entropy is used as the loss function.
An RNNLM is a neural network made up of four layers.
Output layer: the one-hot vector of 'cat', the word that follows 'fat', is used to compute the error of the model's prediction; training then proceeds from this error through the loss function.
The one-hot vectors fed into the RNNLM pass through an embedding layer (projection layer). If the vocabulary size is V and the embedding dimension is M, each input word is multiplied by a V x M embedding matrix in this layer, and that embedding matrix is learned together with the other weights during backpropagation.
The resulting e(t) is passed on to the hidden layer, where it is combined with the previous hidden state h(t-1) to compute the current hidden state h(t) (using the hyperbolic tangent function).
The output layer uses softmax as its activation function. Because the softmax output should move toward the one-hot vector of the actual answer word, cross-entropy is used as the loss function here.
As backpropagation runs, the weight matrices are learned, and the embedding vectors are learned along with them.
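Putting the last few paragraphs together, one time step of the RNNLM can be sketched as below (illustrative sizes and randomly initialized weights; a schematic only, not the Keras model trained later):
```python
import numpy as np

V, M, H = 12, 10, 32                       # vocab size, embedding dim, hidden dim (illustrative)
E = np.random.randn(V, M)                  # embedding (projection) matrix, learned by backprop
Wx = np.random.randn(H, M)
Wh = np.random.randn(H, H)
b = np.random.randn(H)
Wy = np.random.randn(V, H)
c = np.random.randn(V)

def rnnlm_step(word_id, h_prev):
    e_t = E[word_id]                               # one-hot input times the V x M embedding matrix == row lookup
    h_t = np.tanh(Wx @ e_t + Wh @ h_prev + b)      # hidden state from e(t) and h(t-1)
    logits = Wy @ h_t + c
    y_hat = np.exp(logits) / np.exp(logits).sum()  # softmax over the vocabulary
    return y_hat, h_t

y_hat, h = rnnlm_step(word_id=3, h_prev=np.zeros(H))
loss = -np.log(y_hat[7])                           # cross-entropy against the one-hot answer (here index 7)
print(loss)
```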
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.utils import to_categorical
text="""경마장에 있는 말이 뛰고 있다\n
그의 말이 법이다\n
가는 말이 고와야 오는 말이 곱다\n"""
t = Tokenizer()
t.fit_on_texts([text])
vocab_size = len(t.word_index) + 1
# The Keras tokenizer's integer encoding starts indexing at 1,
# but Keras one-hot encoding uses array indices that start at 0,
# so the array must be one larger than the actual vocabulary size; hence the +1 above.
print('단어 집합의 크기 : %d' % vocab_size)
print(t.word_index)
단어 집합의 크기 : 12
{'말이': 1, '경마장에': 2, '있는': 3, '뛰고': 4, '있다': 5, '그의': 6, '법이다': 7, '가는': 8, '고와야': 9, '오는': 10, '곱다': 11}
# integer encoding
sequences = list()
for line in text.split('\n'): # split into sentences on '\n'
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)
print('학습에 사용할 샘플의 개수: %d' % len(sequences))
print(sequences)
학습에 사용할 샘플의 개수: 11 [[2, 3], [2, 3, 1], [2, 3, 1, 4], [2, 3, 1, 4, 5], [6, 1], [6, 1, 7], [8, 1], [8, 1, 9], [8, 1, 9, 10], [8, 1, 9, 10, 1], [8, 1, 9, 10, 1, 11]]
# padding
max_len=max(len(l) for l in sequences) # length of the longest sample
print('샘플의 최대 길이 : {}'.format(max_len))
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(sequences)
샘플의 최대 길이 : 6 [[ 0 0 0 0 2 3] [ 0 0 0 2 3 1] [ 0 0 2 3 1 4] [ 0 2 3 1 4 5] [ 0 0 0 0 6 1] [ 0 0 0 6 1 7] [ 0 0 0 0 8 1] [ 0 0 0 8 1 9] [ 0 0 8 1 9 10] [ 0 8 1 9 10 1] [ 8 1 9 10 1 11]]
# split off the labels
sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]
# X holds everything in each list except the last value
# y holds only the last value of each list; these are the labels
print(X)
print(y) # 있는 말이 뛰고 있다 말이 법이다 말이 고와야 오는 말이 곱다
[[ 0 0 0 0 2] [ 0 0 0 2 3] [ 0 0 2 3 1] [ 0 2 3 1 4] [ 0 0 0 0 6] [ 0 0 0 6 1] [ 0 0 0 0 8] [ 0 0 0 8 1] [ 0 0 8 1 9] [ 0 8 1 9 10] [ 8 1 9 10 1]] [ 3 1 4 5 1 7 1 9 10 1 11]
# one-hot encode the labels
y = to_categorical(y, num_classes=vocab_size)
print(y)
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.] [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.] [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.] [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.] # one-hot vector for 3
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] # one-hot vector for 1
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.] # one-hot vector for 4
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.] # one-hot vector for 5
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] # one-hot vector for 1
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.] # one-hot vector for 7
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] # one-hot vector for 1
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.] # one-hot vector for 9
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.] # one-hot vector for 10
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] # one-hot vector for 1
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]] # one-hot vector for 11
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_len-1)) # since the label was split off, the length of X is now 5
model.add(SimpleRNN(32))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)
Epoch 1/200 1/1 - 0s - loss: 2.5012 - accuracy: 0.0909 Epoch 2/200 1/1 - 0s - loss: 2.4883 - accuracy: 0.0909 Epoch 3/200 1/1 - 0s - loss: 2.4756 - accuracy: 0.1818 Epoch 4/200 1/1 - 0s - loss: 2.4628 - accuracy: 0.3636 Epoch 5/200 1/1 - 0s - loss: 2.4499 - accuracy: 0.3636 Epoch 6/200 1/1 - 0s - loss: 2.4368 - accuracy: 0.3636 Epoch 7/200 1/1 - 0s - loss: 2.4233 - accuracy: 0.3636 Epoch 8/200 1/1 - 0s - loss: 2.4095 - accuracy: 0.4545 Epoch 9/200 1/1 - 0s - loss: 2.3952 - accuracy: 0.5455 Epoch 10/200 1/1 - 0s - loss: 2.3804 - accuracy: 0.5455 Epoch 11/200 1/1 - 0s - loss: 2.3650 - accuracy: 0.5455 Epoch 12/200 1/1 - 0s - loss: 2.3491 - accuracy: 0.5455 Epoch 13/200 1/1 - 0s - loss: 2.3325 - accuracy: 0.4545 Epoch 14/200 1/1 - 0s - loss: 2.3152 - accuracy: 0.4545 Epoch 15/200 1/1 - 0s - loss: 2.2971 - accuracy: 0.4545 Epoch 16/200 1/1 - 0s - loss: 2.2783 - accuracy: 0.4545 Epoch 17/200 1/1 - 0s - loss: 2.2587 - accuracy: 0.4545 Epoch 18/200 1/1 - 0s - loss: 2.2384 - accuracy: 0.4545 Epoch 19/200 1/1 - 0s - loss: 2.2173 - accuracy: 0.3636 Epoch 20/200 1/1 - 0s - loss: 2.1955 - accuracy: 0.3636 Epoch 21/200 1/1 - 0s - loss: 2.1731 - accuracy: 0.3636 Epoch 22/200 1/1 - 0s - loss: 2.1502 - accuracy: 0.3636 Epoch 23/200 1/1 - 0s - loss: 2.1270 - accuracy: 0.3636 Epoch 24/200 1/1 - 0s - loss: 2.1035 - accuracy: 0.3636 Epoch 25/200 1/1 - 0s - loss: 2.0800 - accuracy: 0.3636 Epoch 26/200 1/1 - 0s - loss: 2.0567 - accuracy: 0.3636 Epoch 27/200 1/1 - 0s - loss: 2.0337 - accuracy: 0.3636 Epoch 28/200 1/1 - 0s - loss: 2.0114 - accuracy: 0.3636 Epoch 29/200 1/1 - 0s - loss: 1.9897 - accuracy: 0.3636 Epoch 30/200 1/1 - 0s - loss: 1.9690 - accuracy: 0.3636 Epoch 31/200 1/1 - 0s - loss: 1.9492 - accuracy: 0.3636 Epoch 32/200 1/1 - 0s - loss: 1.9302 - accuracy: 0.3636 Epoch 33/200 1/1 - 0s - loss: 1.9120 - accuracy: 0.3636 Epoch 34/200 1/1 - 0s - loss: 1.8943 - accuracy: 0.3636 Epoch 35/200 1/1 - 0s - loss: 1.8770 - accuracy: 0.3636 Epoch 36/200 1/1 - 0s - loss: 1.8597 - accuracy: 0.3636 Epoch 37/200 1/1 - 0s - loss: 1.8422 - accuracy: 0.3636 Epoch 38/200 1/1 - 0s - loss: 1.8243 - accuracy: 0.3636 Epoch 39/200 1/1 - 0s - loss: 1.8060 - accuracy: 0.3636 Epoch 40/200 1/1 - 0s - loss: 1.7871 - accuracy: 0.3636 Epoch 41/200 1/1 - 0s - loss: 1.7677 - accuracy: 0.4545 Epoch 42/200 1/1 - 0s - loss: 1.7479 - accuracy: 0.4545 Epoch 43/200 1/1 - 0s - loss: 1.7276 - accuracy: 0.4545 Epoch 44/200 1/1 - 0s - loss: 1.7072 - accuracy: 0.4545 Epoch 45/200 1/1 - 0s - loss: 1.6865 - accuracy: 0.4545 Epoch 46/200 1/1 - 0s - loss: 1.6659 - accuracy: 0.4545 Epoch 47/200 1/1 - 0s - loss: 1.6453 - accuracy: 0.4545 Epoch 48/200 1/1 - 0s - loss: 1.6247 - accuracy: 0.4545 Epoch 49/200 1/1 - 0s - loss: 1.6043 - accuracy: 0.4545 Epoch 50/200 1/1 - 0s - loss: 1.5839 - accuracy: 0.4545 Epoch 51/200 1/1 - 0s - loss: 1.5636 - accuracy: 0.4545 Epoch 52/200 1/1 - 0s - loss: 1.5433 - accuracy: 0.4545 Epoch 53/200 1/1 - 0s - loss: 1.5230 - accuracy: 0.4545 Epoch 54/200 1/1 - 0s - loss: 1.5027 - accuracy: 0.5455 Epoch 55/200 1/1 - 0s - loss: 1.4825 - accuracy: 0.5455 Epoch 56/200 1/1 - 0s - loss: 1.4623 - accuracy: 0.5455 Epoch 57/200 1/1 - 0s - loss: 1.4423 - accuracy: 0.5455 Epoch 58/200 1/1 - 0s - loss: 1.4224 - accuracy: 0.5455 Epoch 59/200 1/1 - 0s - loss: 1.4027 - accuracy: 0.5455 Epoch 60/200 1/1 - 0s - loss: 1.3831 - accuracy: 0.5455 Epoch 61/200 1/1 - 0s - loss: 1.3638 - accuracy: 0.5455 Epoch 62/200 1/1 - 0s - loss: 1.3446 - accuracy: 0.5455 Epoch 63/200 1/1 - 0s - loss: 1.3254 - accuracy: 0.5455 Epoch 64/200 1/1 - 0s - loss: 
1.3064 - accuracy: 0.5455 Epoch 65/200 1/1 - 0s - loss: 1.2875 - accuracy: 0.5455 Epoch 66/200 1/1 - 0s - loss: 1.2686 - accuracy: 0.5455 Epoch 67/200 1/1 - 0s - loss: 1.2498 - accuracy: 0.5455 Epoch 68/200 1/1 - 0s - loss: 1.2310 - accuracy: 0.5455 Epoch 69/200 1/1 - 0s - loss: 1.2124 - accuracy: 0.5455 Epoch 70/200 1/1 - 0s - loss: 1.1938 - accuracy: 0.5455 Epoch 71/200 1/1 - 0s - loss: 1.1753 - accuracy: 0.5455 Epoch 72/200 1/1 - 0s - loss: 1.1569 - accuracy: 0.5455 Epoch 73/200 1/1 - 0s - loss: 1.1386 - accuracy: 0.6364 Epoch 74/200 1/1 - 0s - loss: 1.1204 - accuracy: 0.6364 Epoch 75/200 1/1 - 0s - loss: 1.1023 - accuracy: 0.6364 Epoch 76/200 1/1 - 0s - loss: 1.0843 - accuracy: 0.6364 Epoch 77/200 1/1 - 0s - loss: 1.0664 - accuracy: 0.6364 Epoch 78/200 1/1 - 0s - loss: 1.0487 - accuracy: 0.6364 Epoch 79/200 1/1 - 0s - loss: 1.0311 - accuracy: 0.6364 Epoch 80/200 1/1 - 0s - loss: 1.0136 - accuracy: 0.6364 Epoch 81/200 1/1 - 0s - loss: 0.9963 - accuracy: 0.6364 Epoch 82/200 1/1 - 0s - loss: 0.9791 - accuracy: 0.6364 Epoch 83/200 1/1 - 0s - loss: 0.9621 - accuracy: 0.6364 Epoch 84/200 1/1 - 0s - loss: 0.9453 - accuracy: 0.6364 Epoch 85/200 1/1 - 0s - loss: 0.9286 - accuracy: 0.6364 Epoch 86/200 1/1 - 0s - loss: 0.9121 - accuracy: 0.6364 Epoch 87/200 1/1 - 0s - loss: 0.8958 - accuracy: 0.6364 Epoch 88/200 1/1 - 0s - loss: 0.8797 - accuracy: 0.6364 Epoch 89/200 1/1 - 0s - loss: 0.8638 - accuracy: 0.6364 Epoch 90/200 1/1 - 0s - loss: 0.8481 - accuracy: 0.7273 Epoch 91/200 1/1 - 0s - loss: 0.8326 - accuracy: 0.7273 Epoch 92/200 1/1 - 0s - loss: 0.8173 - accuracy: 0.7273 Epoch 93/200 1/1 - 0s - loss: 0.8022 - accuracy: 0.8182 Epoch 94/200 1/1 - 0s - loss: 0.7873 - accuracy: 0.8182 Epoch 95/200 1/1 - 0s - loss: 0.7726 - accuracy: 0.8182 Epoch 96/200 1/1 - 0s - loss: 0.7581 - accuracy: 0.8182 Epoch 97/200 1/1 - 0s - loss: 0.7438 - accuracy: 0.8182 Epoch 98/200 1/1 - 0s - loss: 0.7296 - accuracy: 0.8182 Epoch 99/200 1/1 - 0s - loss: 0.7157 - accuracy: 0.8182 Epoch 100/200 1/1 - 0s - loss: 0.7019 - accuracy: 0.8182 Epoch 101/200 1/1 - 0s - loss: 0.6883 - accuracy: 0.8182 Epoch 102/200 1/1 - 0s - loss: 0.6749 - accuracy: 0.8182 Epoch 103/200 1/1 - 0s - loss: 0.6616 - accuracy: 0.8182 Epoch 104/200 1/1 - 0s - loss: 0.6486 - accuracy: 0.8182 Epoch 105/200 1/1 - 0s - loss: 0.6357 - accuracy: 0.8182 Epoch 106/200 1/1 - 0s - loss: 0.6229 - accuracy: 0.8182 Epoch 107/200 1/1 - 0s - loss: 0.6104 - accuracy: 0.8182 Epoch 108/200 1/1 - 0s - loss: 0.5980 - accuracy: 0.8182 Epoch 109/200 1/1 - 0s - loss: 0.5859 - accuracy: 0.9091 Epoch 110/200 1/1 - 0s - loss: 0.5739 - accuracy: 0.9091 Epoch 111/200 1/1 - 0s - loss: 0.5620 - accuracy: 0.9091 Epoch 112/200 1/1 - 0s - loss: 0.5504 - accuracy: 0.9091 Epoch 113/200 1/1 - 0s - loss: 0.5390 - accuracy: 0.9091 Epoch 114/200 1/1 - 0s - loss: 0.5277 - accuracy: 0.9091 Epoch 115/200 1/1 - 0s - loss: 0.5166 - accuracy: 0.9091 Epoch 116/200 1/1 - 0s - loss: 0.5057 - accuracy: 0.9091 Epoch 117/200 1/1 - 0s - loss: 0.4950 - accuracy: 0.9091 Epoch 118/200 1/1 - 0s - loss: 0.4845 - accuracy: 0.9091 Epoch 119/200 1/1 - 0s - loss: 0.4742 - accuracy: 0.9091 Epoch 120/200 1/1 - 0s - loss: 0.4640 - accuracy: 0.9091 Epoch 121/200 1/1 - 0s - loss: 0.4541 - accuracy: 0.9091 Epoch 122/200 1/1 - 0s - loss: 0.4444 - accuracy: 0.9091 Epoch 123/200 1/1 - 0s - loss: 0.4348 - accuracy: 0.9091 Epoch 124/200 1/1 - 0s - loss: 0.4254 - accuracy: 0.9091 Epoch 125/200 1/1 - 0s - loss: 0.4162 - accuracy: 0.9091 Epoch 126/200 1/1 - 0s - loss: 0.4073 - accuracy: 0.9091 Epoch 127/200 1/1 - 0s - 
loss: 0.3985 - accuracy: 0.9091 Epoch 128/200 1/1 - 0s - loss: 0.3898 - accuracy: 0.9091 Epoch 129/200 1/1 - 0s - loss: 0.3814 - accuracy: 0.9091 Epoch 130/200 1/1 - 0s - loss: 0.3732 - accuracy: 0.9091 Epoch 131/200 1/1 - 0s - loss: 0.3651 - accuracy: 0.9091 Epoch 132/200 1/1 - 0s - loss: 0.3573 - accuracy: 0.9091 Epoch 133/200 1/1 - 0s - loss: 0.3496 - accuracy: 0.9091 Epoch 134/200 1/1 - 0s - loss: 0.3420 - accuracy: 1.0000 Epoch 135/200 1/1 - 0s - loss: 0.3347 - accuracy: 1.0000 Epoch 136/200 1/1 - 0s - loss: 0.3275 - accuracy: 1.0000 Epoch 137/200 1/1 - 0s - loss: 0.3205 - accuracy: 1.0000 Epoch 138/200 1/1 - 0s - loss: 0.3137 - accuracy: 1.0000 Epoch 139/200 1/1 - 0s - loss: 0.3070 - accuracy: 1.0000 Epoch 140/200 1/1 - 0s - loss: 0.3005 - accuracy: 1.0000 Epoch 141/200 1/1 - 0s - loss: 0.2941 - accuracy: 1.0000 Epoch 142/200 1/1 - 0s - loss: 0.2879 - accuracy: 1.0000 Epoch 143/200 1/1 - 0s - loss: 0.2819 - accuracy: 1.0000 Epoch 144/200 1/1 - 0s - loss: 0.2760 - accuracy: 1.0000 Epoch 145/200 1/1 - 0s - loss: 0.2702 - accuracy: 1.0000 Epoch 146/200 1/1 - 0s - loss: 0.2646 - accuracy: 1.0000 Epoch 147/200 1/1 - 0s - loss: 0.2591 - accuracy: 1.0000 Epoch 148/200 1/1 - 0s - loss: 0.2538 - accuracy: 1.0000 Epoch 149/200 1/1 - 0s - loss: 0.2486 - accuracy: 1.0000 Epoch 150/200 1/1 - 0s - loss: 0.2435 - accuracy: 1.0000 Epoch 151/200 1/1 - 0s - loss: 0.2386 - accuracy: 1.0000 Epoch 152/200 1/1 - 0s - loss: 0.2337 - accuracy: 1.0000 Epoch 153/200 1/1 - 0s - loss: 0.2290 - accuracy: 1.0000 Epoch 154/200 1/1 - 0s - loss: 0.2244 - accuracy: 1.0000 Epoch 155/200 1/1 - 0s - loss: 0.2199 - accuracy: 1.0000 Epoch 156/200 1/1 - 0s - loss: 0.2156 - accuracy: 1.0000 Epoch 157/200 1/1 - 0s - loss: 0.2113 - accuracy: 1.0000 Epoch 158/200 1/1 - 0s - loss: 0.2072 - accuracy: 1.0000 Epoch 159/200 1/1 - 0s - loss: 0.2031 - accuracy: 1.0000 Epoch 160/200 1/1 - 0s - loss: 0.1992 - accuracy: 1.0000 Epoch 161/200 1/1 - 0s - loss: 0.1953 - accuracy: 1.0000 Epoch 162/200 1/1 - 0s - loss: 0.1916 - accuracy: 1.0000 Epoch 163/200 1/1 - 0s - loss: 0.1879 - accuracy: 1.0000 Epoch 164/200 1/1 - 0s - loss: 0.1843 - accuracy: 1.0000 Epoch 165/200 1/1 - 0s - loss: 0.1808 - accuracy: 1.0000 Epoch 166/200 1/1 - 0s - loss: 0.1774 - accuracy: 1.0000 Epoch 167/200 1/1 - 0s - loss: 0.1741 - accuracy: 1.0000 Epoch 168/200 1/1 - 0s - loss: 0.1709 - accuracy: 1.0000 Epoch 169/200 1/1 - 0s - loss: 0.1677 - accuracy: 1.0000 Epoch 170/200 1/1 - 0s - loss: 0.1646 - accuracy: 1.0000 Epoch 171/200 1/1 - 0s - loss: 0.1616 - accuracy: 1.0000 Epoch 172/200 1/1 - 0s - loss: 0.1587 - accuracy: 1.0000 Epoch 173/200 1/1 - 0s - loss: 0.1558 - accuracy: 1.0000 Epoch 174/200 1/1 - 0s - loss: 0.1530 - accuracy: 1.0000 Epoch 175/200 1/1 - 0s - loss: 0.1502 - accuracy: 1.0000 Epoch 176/200 1/1 - 0s - loss: 0.1476 - accuracy: 1.0000 Epoch 177/200 1/1 - 0s - loss: 0.1450 - accuracy: 1.0000 Epoch 178/200 1/1 - 0s - loss: 0.1424 - accuracy: 1.0000 Epoch 179/200 1/1 - 0s - loss: 0.1399 - accuracy: 1.0000 Epoch 180/200 1/1 - 0s - loss: 0.1375 - accuracy: 1.0000 Epoch 181/200 1/1 - 0s - loss: 0.1351 - accuracy: 1.0000 Epoch 182/200 1/1 - 0s - loss: 0.1328 - accuracy: 1.0000 Epoch 183/200 1/1 - 0s - loss: 0.1305 - accuracy: 1.0000 Epoch 184/200 1/1 - 0s - loss: 0.1283 - accuracy: 1.0000 Epoch 185/200 1/1 - 0s - loss: 0.1261 - accuracy: 1.0000 Epoch 186/200 1/1 - 0s - loss: 0.1240 - accuracy: 1.0000 Epoch 187/200 1/1 - 0s - loss: 0.1219 - accuracy: 1.0000 Epoch 188/200 1/1 - 0s - loss: 0.1199 - accuracy: 1.0000 Epoch 189/200 1/1 - 0s - loss: 0.1179 - 
accuracy: 1.0000 Epoch 190/200 1/1 - 0s - loss: 0.1160 - accuracy: 1.0000 Epoch 191/200 1/1 - 0s - loss: 0.1141 - accuracy: 1.0000 Epoch 192/200 1/1 - 0s - loss: 0.1122 - accuracy: 1.0000 Epoch 193/200 1/1 - 0s - loss: 0.1104 - accuracy: 1.0000 Epoch 194/200 1/1 - 0s - loss: 0.1087 - accuracy: 1.0000 Epoch 195/200 1/1 - 0s - loss: 0.1069 - accuracy: 1.0000 Epoch 196/200 1/1 - 0s - loss: 0.1052 - accuracy: 1.0000 Epoch 197/200 1/1 - 0s - loss: 0.1036 - accuracy: 1.0000 Epoch 198/200 1/1 - 0s - loss: 0.1019 - accuracy: 1.0000 Epoch 199/200 1/1 - 0s - loss: 0.1004 - accuracy: 1.0000 Epoch 200/200 1/1 - 0s - loss: 0.0988 - accuracy: 1.0000
<tensorflow.python.keras.callbacks.History at 0x7f89b39bf828>
def sentence_generation(model, t, current_word, n): # model, tokenizer, seed word, number of words to predict
    init_word = current_word # keep the seed word so it can be printed together with the generated words
    sentence = ''
    for _ in range(n): # repeat n times
        encoded = t.texts_to_sequences([current_word])[0] # integer-encode the current word sequence
        encoded = pad_sequences([encoded], maxlen=5, padding='pre') # pad the input
        result = model.predict_classes(encoded, verbose=0)
        # predict y (the next word) for the input X (the current word sequence) and store it in result
        for word, index in t.word_index.items():
            if index == result: # if a word's index matches the predicted index
                break # that word is the predicted word, so stop searching
        current_word = current_word + ' ' + word # current word sequence + ' ' + predicted word becomes the new input
        sentence = sentence + ' ' + word # append the predicted word to the generated sentence
        # the loop then repeats this procedure
    sentence = init_word + sentence
    return sentence
print(sentence_generation(model, t, '경마장에', 4)) # '경마장에' is followed by 4 words in the training sentence, so predict 4 times
print(sentence_generation(model, t, '그의', 2)) # predict 2 times
print(sentence_generation(model, t, '가는', 5)) # predict 5 times
경마장에 있는 말이 뛰고 있다 그의 말이 법이다 가는 말이 고와야 오는 말이 곱다
import pandas as pd
from string import punctuation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.utils import to_categorical
# 뉴욕타임즈 기사의 제목
df = pd.read_csv('/content/datasets_19447_31436_ArticlesApril2018.csv')
df.head()
| | articleID | articleWordCount | byline | documentType | headline | keywords | multimedia | newDesk | printPage | pubDate | sectionName | snippet | source | typeOfMaterial | webURL |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5adf6684068401528a2aa69b | 781 | By JOHN BRANCH | article | Former N.F.L. Cheerleaders’ Settlement Offer: ... | ['Workplace Hazards and Violations', 'Football... | 68 | Sports | 0 | 2018-04-24 17:16:49 | Pro Football | “I understand that they could meet with us, pa... | The New York Times | News | https://www.nytimes.com/2018/04/24/sports/foot... |
| 1 | 5adf653f068401528a2aa697 | 656 | By LISA FRIEDMAN | article | E.P.A. to Unveil a New Rule. Its Effect: Less ... | ['Environmental Protection Agency', 'Pruitt, S... | 68 | Climate | 0 | 2018-04-24 17:11:21 | Unknown | The agency plans to publish a new regulation T... | The New York Times | News | https://www.nytimes.com/2018/04/24/climate/epa... |
| 2 | 5adf4626068401528a2aa628 | 2427 | By PETE WELLS | article | The New Noma, Explained | ['Restaurants', 'Noma (Copenhagen, Restaurant)... | 66 | Dining | 0 | 2018-04-24 14:58:44 | Unknown | What’s it like to eat at the second incarnatio... | The New York Times | News | https://www.nytimes.com/2018/04/24/dining/noma... |
| 3 | 5adf40d2068401528a2aa619 | 626 | By JULIE HIRSCHFELD DAVIS and PETER BAKER | article | Unknown | ['Macron, Emmanuel (1977- )', 'Trump, Donald J... | 68 | Washington | 0 | 2018-04-24 14:35:57 | Europe | President Trump welcomed President Emmanuel Ma... | The New York Times | News | https://www.nytimes.com/2018/04/24/world/europ... |
| 4 | 5adf3d64068401528a2aa60f | 815 | By IAN AUSTEN and DAN BILEFSKY | article | Unknown | ['Toronto, Ontario, Attack (April, 2018)', 'Mu... | 68 | Foreign | 0 | 2018-04-24 14:21:21 | Canada | Alek Minassian, 25, a resident of Toronto’s Ri... | The New York Times | News | https://www.nytimes.com/2018/04/24/world/canad... |
print('열의 개수: ',len(df.columns))
print(df.columns)
열의 개수: 15
Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
dtype='object')
# headline 열에서 NULL이 있는지 확인 False면 NULL이 없는 거.
df['headline'].isnull().values.any()
False
headline = [] # 리스트 선언
headline.extend(list(df.headline.values)) # 헤드라인의 값들을 리스트로 저장
headline[:5] # 상위 5개만 출력
['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell', 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.', 'The New Noma, Explained', 'Unknown', 'Unknown']
print('총 샘플의 개수 : {}'.format(len(headline)))
총 샘플의 개수 : 1324
headline = [n for n in headline if n != "Unknown"] # Unknown 값을 가진 샘플 제거
print('노이즈값 제거 후 샘플의 개수 : {}'.format(len(headline))) # 제거 후 샘플의 개수
노이즈값 제거 후 샘플의 개수 : 1214
headline[:5]
['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell', 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.', 'The New Noma, Explained', 'How a Bag of Texas Dirt Became a Times Tradition', 'Is School a Place for Self-Expression?']
# Text preprocessing
def repreprocessing(s):
    s = s.encode("utf8").decode("ascii", 'ignore') # drop non-ASCII byte sequences
    return ''.join(c for c in s if c not in punctuation).lower() # remove punctuation and lowercase
text = [repreprocessing(x) for x in headline]
text[:5]
['former nfl cheerleaders settlement offer 1 and a meeting with goodell', 'epa to unveil a new rule its effect less science in policymaking', 'the new noma explained', 'how a bag of texas dirt became a times tradition', 'is school a place for selfexpression']
# 단어집합 생성
t = Tokenizer()
t.fit_on_texts(text)
vocab_size = len(t.word_index) + 1
print('단어 집합의 크기 : %d' % vocab_size)
단어 집합의 크기 : 3494
# Integer encoding: decompose each headline into multiple prefix sequences to build the training data
sequences = list()
for line in text: # iterate over the 1,214 samples one at a time
    encoded = t.texts_to_sequences([line])[0] # integer-encode each sample
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)
sequences[:11] # print the first 11 sequences
[[99, 269], [99, 269, 371], [99, 269, 371, 1115], [99, 269, 371, 1115, 582], [99, 269, 371, 1115, 582, 52], [99, 269, 371, 1115, 582, 52, 7], [99, 269, 371, 1115, 582, 52, 7, 2], [99, 269, 371, 1115, 582, 52, 7, 2, 372], [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10], [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116], [100, 3]]
[[99, 269],                                          # former nfl
 [99, 269, 371],                                     # former nfl cheerleaders
 [99, 269, 371, 1115],                               # former nfl cheerleaders settlement
 [99, 269, 371, 1115, 582],                          # former nfl cheerleaders settlement offer
 [99, 269, 371, 1115, 582, 52],                      # former nfl cheerleaders settlement offer 1
 [99, 269, 371, 1115, 582, 52, 7],                   # former nfl cheerleaders settlement offer 1 and
 [99, 269, 371, 1115, 582, 52, 7, 2],                # ... and so on ...
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116], # the complete first headline, using every word
 # the line above corresponds to: former nfl cheerleaders settlement offer 1 and a meeting with goodell
 [100, 3]]                                           # 'epa to', where the second headline begins
# Build index_to_word so that predicted indices can be decoded back to words
index_to_word = {}
for key, value in t.word_index.items(): # invert word_index to map an index back to its word
    index_to_word[value] = key
print('빈도수 상위 582번 단어 : {}'.format(index_to_word[582]))
빈도수 상위 582번 단어 : offer
# 패딩
max_len=max(len(l) for l in sequences)
print('샘플의 최대 길이 : {}'.format(max_len))
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(sequences[:3])
샘플의 최대 길이 : 24
[[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 99 269]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 99 269 371]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 99 269 371 1115]]
# 맨 우측 단어만 레이블로 분리
sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]
print(X[:3])
print(y[:3])
[[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 99]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 99 269]
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 99 269 371]]
[ 269 371 1115]
# 원핫 인코딩
y = to_categorical(y, num_classes=vocab_size)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_len-1))
# y데이터를 분리하였으므로 이제 X데이터의 길이는 기존 데이터의 길이 - 1
model.add(LSTM(128))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)
Epoch 1/200 244/244 - 8s - loss: 7.6567 - accuracy: 0.0268 Epoch 2/200 244/244 - 8s - loss: 7.1273 - accuracy: 0.0308 Epoch 3/200 244/244 - 7s - loss: 6.9912 - accuracy: 0.0326 Epoch 4/200 244/244 - 7s - loss: 6.8772 - accuracy: 0.0391 Epoch 5/200 244/244 - 7s - loss: 6.7508 - accuracy: 0.0433 Epoch 6/200 244/244 - 7s - loss: 6.5997 - accuracy: 0.0460 Epoch 7/200 244/244 - 7s - loss: 6.4308 - accuracy: 0.0495 Epoch 8/200 244/244 - 7s - loss: 6.2386 - accuracy: 0.0542 Epoch 9/200 244/244 - 7s - loss: 6.0448 - accuracy: 0.0573 Epoch 10/200 244/244 - 7s - loss: 5.8539 - accuracy: 0.0610 Epoch 11/200 244/244 - 7s - loss: 5.6696 - accuracy: 0.0654 Epoch 12/200 244/244 - 7s - loss: 5.4941 - accuracy: 0.0752 Epoch 13/200 244/244 - 7s - loss: 5.3296 - accuracy: 0.0788 Epoch 14/200 244/244 - 7s - loss: 5.1784 - accuracy: 0.0788 Epoch 15/200 244/244 - 7s - loss: 5.0258 - accuracy: 0.0929 Epoch 16/200 244/244 - 7s - loss: 4.8838 - accuracy: 0.0969 Epoch 17/200 244/244 - 7s - loss: 4.7489 - accuracy: 0.1084 Epoch 18/200 244/244 - 7s - loss: 4.6216 - accuracy: 0.1234 Epoch 19/200 244/244 - 7s - loss: 4.4997 - accuracy: 0.1396 Epoch 20/200 244/244 - 7s - loss: 4.3785 - accuracy: 0.1507 Epoch 21/200 244/244 - 7s - loss: 4.2630 - accuracy: 0.1672 Epoch 22/200 244/244 - 7s - loss: 4.1519 - accuracy: 0.1880 Epoch 23/200 244/244 - 7s - loss: 4.0419 - accuracy: 0.2047 Epoch 24/200 244/244 - 7s - loss: 3.9346 - accuracy: 0.2185 Epoch 25/200 244/244 - 7s - loss: 3.8341 - accuracy: 0.2336 Epoch 26/200 244/244 - 7s - loss: 3.7338 - accuracy: 0.2570 Epoch 27/200 244/244 - 7s - loss: 3.6395 - accuracy: 0.2681 Epoch 28/200 244/244 - 7s - loss: 3.5421 - accuracy: 0.2881 Epoch 29/200 244/244 - 7s - loss: 3.4536 - accuracy: 0.3085 Epoch 30/200 244/244 - 8s - loss: 3.3668 - accuracy: 0.3215 Epoch 31/200 244/244 - 7s - loss: 3.2832 - accuracy: 0.3333 Epoch 32/200 244/244 - 7s - loss: 3.2039 - accuracy: 0.3505 Epoch 33/200 244/244 - 7s - loss: 3.1250 - accuracy: 0.3691 Epoch 34/200 244/244 - 7s - loss: 3.0489 - accuracy: 0.3836 Epoch 35/200 244/244 - 7s - loss: 2.9771 - accuracy: 0.3969 Epoch 36/200 244/244 - 7s - loss: 2.9084 - accuracy: 0.4084 Epoch 37/200 244/244 - 7s - loss: 2.8380 - accuracy: 0.4180 Epoch 38/200 244/244 - 7s - loss: 2.7728 - accuracy: 0.4356 Epoch 39/200 244/244 - 7s - loss: 2.7109 - accuracy: 0.4409 Epoch 40/200 244/244 - 7s - loss: 2.6484 - accuracy: 0.4576 Epoch 41/200 244/244 - 7s - loss: 2.5914 - accuracy: 0.4641 Epoch 42/200 244/244 - 7s - loss: 2.5323 - accuracy: 0.4744 Epoch 43/200 244/244 - 7s - loss: 2.4806 - accuracy: 0.4889 Epoch 44/200 244/244 - 7s - loss: 2.4263 - accuracy: 0.4998 Epoch 45/200 244/244 - 8s - loss: 2.3710 - accuracy: 0.5116 Epoch 46/200 244/244 - 7s - loss: 2.3195 - accuracy: 0.5185 Epoch 47/200 244/244 - 7s - loss: 2.2706 - accuracy: 0.5290 Epoch 48/200 244/244 - 7s - loss: 2.2203 - accuracy: 0.5415 Epoch 49/200
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-16-50a8ce36e604> in <module>() 8 model.add(Dense(vocab_size, activation='softmax')) 9 model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) ---> 10 model.fit(X, y, epochs=200, verbose=2) /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs) 106 def _method_wrapper(self, *args, **kwargs): 107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access --> 108 return method(self, *args, **kwargs) 109 110 # Running inside `run_distribute_coordinator` already. /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing) 1096 batch_size=batch_size): 1097 callbacks.on_train_batch_begin(step) -> 1098 tmp_logs = train_function(iterator) 1099 if data_handler.should_sync: 1100 context.async_wait() /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds) 778 else: 779 compiler = "nonXla" --> 780 result = self._call(*args, **kwds) 781 782 new_tracing_count = self._get_tracing_count() /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds) 805 # In this case we have created variables on the first call, so we run the 806 # defunned version which is guaranteed to never create variables. --> 807 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable 808 elif self._stateful_fn is not None: 809 # Release the lock early so that multiple threads can perform the call /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs) 2827 with self._lock: 2828 graph_function, args, kwargs = self._maybe_define_function(args, kwargs) -> 2829 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access 2830 2831 @property /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _filtered_call(self, args, kwargs, cancellation_manager) 1846 resource_variable_ops.BaseResourceVariable))], 1847 captured_inputs=self.captured_inputs, -> 1848 cancellation_manager=cancellation_manager) 1849 1850 def _call_flat(self, args, captured_inputs, cancellation_manager=None): /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager) 1922 # No tape is watching; skip to running the function. 
1923 return self._build_call_outputs(self._inference_function.call( -> 1924 ctx, args, cancellation_manager=cancellation_manager)) 1925 forward_backward = self._select_forward_and_backward_functions( 1926 args, /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager) 548 inputs=args, 549 attrs=attrs, --> 550 ctx=ctx) 551 else: 552 outputs = execute.execute_with_cancellation( /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name) 58 ctx.ensure_initialized() 59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name, ---> 60 inputs, attrs, num_outputs) 61 except core._NotOkStatusException as e: 62 if name is not None: KeyboardInterrupt:
def sentence_generation(model, t, current_word, n): # model, tokenizer, seed word, number of words to predict
    init_word = current_word # keep the seed word so it can be printed together with the generated words
    sentence = ''
    for _ in range(n): # repeat n times
        encoded = t.texts_to_sequences([current_word])[0] # integer-encode the current word sequence
        encoded = pad_sequences([encoded], maxlen=23, padding='pre') # pad the input (max_len - 1 = 23)
        result = model.predict_classes(encoded, verbose=0)
        # predict y (the next word) for the input X (the current word sequence) and store it in result
        for word, index in t.word_index.items():
            if index == result: # if a word's index matches the predicted index
                break # that word is the predicted word, so stop searching
        current_word = current_word + ' ' + word # current word sequence + ' ' + predicted word becomes the new input
        sentence = sentence + ' ' + word # append the predicted word to the generated sentence
        # the loop then repeats this procedure
    sentence = init_word + sentence
    return sentence
print(sentence_generation(model, t, 'i', 10)) # generate 10 more words after the arbitrary seed word 'i'
print(sentence_generation(model, t, 'how', 10)) # generate 10 more words after the arbitrary seed word 'how'
i will threatens how as a marriage is a sudden plunge how the heck is that abou is a navy denied him
import numpy as np
import urllib.request
from tensorflow.keras.utils import to_categorical
urllib.request.urlretrieve("http://www.gutenberg.org/files/11/11-0.txt", filename="11-0.txt")
f = open('11-0.txt', 'rb')
lines = []
for line in f: # read the data one line at a time
    line = line.strip() # strip() removes \r and \n
    line = line.lower() # lowercase
    line = line.decode('ascii', 'ignore') # drop byte sequences such as \xe2\x80\x99
    if len(line) > 0:
        lines.append(line)
f.close()
lines[:5]
['the project gutenberg ebook of alices adventures in wonderland, by lewis carroll', 'this ebook is for the use of anyone anywhere at no cost and with', 'almost no restrictions whatsoever. you may copy it, give it away or', 're-use it under the terms of the project gutenberg license included', 'with this ebook or online at www.gutenberg.org']
# 하나의 문자열로 통합
text = ' '.join(lines)
print('문자열의 길이 또는 총 글자의 개수: %d' % len(text))
print(text[:200])
문자열의 길이 또는 총 글자의 개수: 159612 the project gutenberg ebook of alices adventures in wonderland, by lewis carroll this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. you may copy it, g
# 글자집합 생성
char_vocab = sorted(list(set(text)))
vocab_size=len(char_vocab)
print ('글자 집합의 크기 : {}'.format(vocab_size))
글자 집합의 크기 : 57
char_to_index = dict((c, i) for i, c in enumerate(char_vocab)) # 글자에 고유한 정수 인덱스 부여
print(char_to_index)
{' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '@': 27, '[': 28, ']': 29, '_': 30, 'a': 31, 'b': 32, 'c': 33, 'd': 34, 'e': 35, 'f': 36, 'g': 37, 'h': 38, 'i': 39, 'j': 40, 'k': 41, 'l': 42, 'm': 43, 'n': 44, 'o': 45, 'p': 46, 'q': 47, 'r': 48, 's': 49, 't': 50, 'u': 51, 'v': 52, 'w': 53, 'x': 54, 'y': 55, 'z': 56}
# Build index_to_char to map an index back to its character, for decoding predictions
index_to_char = {}
for key, value in char_to_index.items():
    index_to_char[value] = key
# Example) if the sample length is 4, a sequence of 4 input characters predicts a sequence of 4 output characters, i.e. the RNN runs for 4 time steps:
# appl -> pple
# appl goes into train_X (the input sequence) and pple into train_y (the sequence to predict)
seq_length = 60 # use sequences of 60 characters
n_samples = int(np.floor((len(text) - 1) / seq_length)) # split the string into chunks of 60: floor((159612 - 1) / 60) = 2660 samples in total
print ('문장 샘플의 수 : {}'.format(n_samples))
문장 샘플의 수 : 2660
train_X = []
train_y = []
for i in range(n_samples): # runs 2,660 times
    X_sample = text[i * seq_length: (i + 1) * seq_length]
    # the loop walks through 0:60 -> 60:120 -> 120:180 ..., taking one sample at a time
    X_encoded = [char_to_index[c] for c in X_sample] # integer-encode one sample
    train_X.append(X_encoded)
    y_sample = text[i * seq_length + 1: (i + 1) * seq_length + 1] # the same window shifted right by one character
    y_encoded = [char_to_index[c] for c in y_sample]
    train_y.append(y_encoded)
print(train_X[0])
print(train_y[0])
[50, 38, 35, 0, 46, 48, 45, 40, 35, 33, 50, 0, 37, 51, 50, 35, 44, 32, 35, 48, 37, 0, 35, 32, 45, 45, 41, 0, 45, 36, 0, 31, 42, 39, 33, 35, 49, 0, 31, 34, 52, 35, 44, 50, 51, 48, 35, 49, 0, 39, 44, 0, 53, 45, 44, 34, 35, 48, 42, 31] [38, 35, 0, 46, 48, 45, 40, 35, 33, 50, 0, 37, 51, 50, 35, 44, 32, 35, 48, 37, 0, 35, 32, 45, 45, 41, 0, 45, 36, 0, 31, 42, 39, 33, 35, 49, 0, 31, 34, 52, 35, 44, 50, 51, 48, 35, 49, 0, 39, 44, 0, 53, 45, 44, 34, 35, 48, 42, 31, 44]
print(train_X[1])
print(train_y[1])
[44, 34, 10, 0, 32, 55, 0, 42, 35, 53, 39, 49, 0, 33, 31, 48, 48, 45, 42, 42, 0, 50, 38, 39, 49, 0, 35, 32, 45, 45, 41, 0, 39, 49, 0, 36, 45, 48, 0, 50, 38, 35, 0, 51, 49, 35, 0, 45, 36, 0, 31, 44, 55, 45, 44, 35, 0, 31, 44, 55] [34, 10, 0, 32, 55, 0, 42, 35, 53, 39, 49, 0, 33, 31, 48, 48, 45, 42, 42, 0, 50, 38, 39, 49, 0, 35, 32, 45, 45, 41, 0, 39, 49, 0, 36, 45, 48, 0, 50, 38, 35, 0, 51, 49, 35, 0, 45, 36, 0, 31, 44, 55, 45, 44, 35, 0, 31, 44, 55, 53]
# 원핫 인코딩
train_X = to_categorical(train_X)
train_y = to_categorical(train_y)
print('train_X의 크기(shape) : {}'.format(train_X.shape)) # 원-핫 인코딩
print('train_y의 크기(shape) : {}'.format(train_y.shape)) # 원-핫 인코딩
train_X의 크기(shape) : (2660, 60, 57) train_y의 크기(shape) : (2660, 60, 57)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed
model = Sequential()
model.add(LSTM(256, input_shape=(None, train_X.shape[2]), return_sequences=True))
model.add(LSTM(256, return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_y, epochs=80, verbose=2)
Epoch 1/80 84/84 - 47s - loss: 3.0822 - accuracy: 0.1806 Epoch 2/80 84/84 - 48s - loss: 2.7466 - accuracy: 0.2467 Epoch 3/80
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-12-3feed3cec92c> in <module>() 8 9 model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) ---> 10 model.fit(train_X, train_y, epochs=80, verbose=2) /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs) 106 def _method_wrapper(self, *args, **kwargs): 107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access --> 108 return method(self, *args, **kwargs) 109 110 # Running inside `run_distribute_coordinator` already. /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing) 1096 batch_size=batch_size): 1097 callbacks.on_train_batch_begin(step) -> 1098 tmp_logs = train_function(iterator) 1099 if data_handler.should_sync: 1100 context.async_wait() /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds) 778 else: 779 compiler = "nonXla" --> 780 result = self._call(*args, **kwds) 781 782 new_tracing_count = self._get_tracing_count() /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds) 805 # In this case we have created variables on the first call, so we run the 806 # defunned version which is guaranteed to never create variables. --> 807 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable 808 elif self._stateful_fn is not None: 809 # Release the lock early so that multiple threads can perform the call /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs) 2827 with self._lock: 2828 graph_function, args, kwargs = self._maybe_define_function(args, kwargs) -> 2829 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access 2830 2831 @property /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _filtered_call(self, args, kwargs, cancellation_manager) 1846 resource_variable_ops.BaseResourceVariable))], 1847 captured_inputs=self.captured_inputs, -> 1848 cancellation_manager=cancellation_manager) 1849 1850 def _call_flat(self, args, captured_inputs, cancellation_manager=None): /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager) 1922 # No tape is watching; skip to running the function. 
1923 return self._build_call_outputs(self._inference_function.call( -> 1924 ctx, args, cancellation_manager=cancellation_manager)) 1925 forward_backward = self._select_forward_and_backward_functions( 1926 args, /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager) 548 inputs=args, 549 attrs=attrs, --> 550 ctx=ctx) 551 else: 552 outputs = execute.execute_with_cancellation( /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name) 58 ctx.ensure_initialized() 59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name, ---> 60 inputs, attrs, num_outputs) 61 except core._NotOkStatusException as e: 62 if name is not None: KeyboardInterrupt:
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= lstm (LSTM) (None, None, 256) 321536 _________________________________________________________________ lstm_1 (LSTM) (None, None, 256) 525312 _________________________________________________________________ time_distributed (TimeDistri (None, None, 57) 14649 ================================================================= Total params: 861,497 Trainable params: 861,497 Non-trainable params: 0 _________________________________________________________________
def sentence_generation(model, length):
    ix = [np.random.randint(vocab_size)] # pick a random character index
    y_char = [index_to_char[ix[-1]]] # the character for that random index
    print(ix[-1], '번 글자', y_char[-1], '로 예측을 시작!')
    X = np.zeros((1, length, vocab_size)) # input sequence for the LSTM, of shape (1, length, 57)
    for i in range(length):
        X[0][i][ix[-1]] = 1 # X[0][i][index of the predicted character] = 1, i.e. append the predicted character to the input sequence
        print(index_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1) # predict the next character from the sequence so far
        y_char.append(index_to_char[ix[-1]])
    return ('').join(y_char)
sentence_generation(model, 100)
37 번 글자 g 로 예측을 시작! g at the the the the the the the the the the t
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-14-084bbdc7c9ea> in <module>() 12 return ('').join(y_char) 13 ---> 14 sentence_generation(model, 100) <ipython-input-14-084bbdc7c9ea> in sentence_generation(model, length) 8 X[0][i][ix[-1]] = 1 # X[0][i][예측한 글자의 인덱스] = 1, 즉, 예측 글자를 다음 입력 시퀀스에 추가 9 print(index_to_char[ix[-1]], end="") ---> 10 ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1) 11 y_char.append(index_to_char[ix[-1]]) 12 return ('').join(y_char) /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs) 128 raise ValueError('{} is not supported in multi-worker mode.'.format( 129 method.__name__)) --> 130 return method(self, *args, **kwargs) 131 132 return tf_decorator.make_decorator( /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in predict(self, x, batch_size, verbose, steps, callbacks, max_queue_size, workers, use_multiprocessing) 1597 for step in data_handler.steps(): 1598 callbacks.on_predict_batch_begin(step) -> 1599 tmp_batch_outputs = predict_function(iterator) 1600 if data_handler.should_sync: 1601 context.async_wait() /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds) 778 else: 779 compiler = "nonXla" --> 780 result = self._call(*args, **kwds) 781 782 new_tracing_count = self._get_tracing_count() /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds) 812 # In this case we have not created variables on the first call. So we can 813 # run the first trace but we should fail if variables are created. --> 814 results = self._stateful_fn(*args, **kwds) 815 if self._created_variables: 816 raise ValueError("Creating variables on a non-first call to a function" /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs) 2827 with self._lock: 2828 graph_function, args, kwargs = self._maybe_define_function(args, kwargs) -> 2829 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access 2830 2831 @property /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _filtered_call(self, args, kwargs, cancellation_manager) 1846 resource_variable_ops.BaseResourceVariable))], 1847 captured_inputs=self.captured_inputs, -> 1848 cancellation_manager=cancellation_manager) 1849 1850 def _call_flat(self, args, captured_inputs, cancellation_manager=None): /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager) 1922 # No tape is watching; skip to running the function. 
1923 return self._build_call_outputs(self._inference_function.call( -> 1924 ctx, args, cancellation_manager=cancellation_manager)) 1925 forward_backward = self._select_forward_and_backward_functions( 1926 args, /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager) 548 inputs=args, 549 attrs=attrs, --> 550 ctx=ctx) 551 else: 552 outputs = execute.execute_with_cancellation( /usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name) 58 ctx.ensure_initialized() 59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name, ---> 60 inputs, attrs, num_outputs) 61 except core._NotOkStatusException as e: 62 if name is not None: KeyboardInterrupt:
import numpy as np
from tensorflow.keras.utils import to_categorical
text='''
I get on with life as a programmer,
I like to contemplate beer.
But when I start to daydream,
My mind turns straight to wine.
Do I love wine more than beer?
I like to use words about beer.
But when I stop my talking,
My mind turns straight to wine.
I hate bugs and errors.
But I just think back to wine,
And I'm happy once again.
I like to hang out with programming and deep learning.
But when left alone,
My mind turns straight to wine.
'''
tokens = text.split() # remove the \n characters
text = ' '.join(tokens)
print(text)
I get on with life as a programmer, I like to contemplate beer. But when I start to daydream, My mind turns straight to wine. Do I love wine more than beer? I like to use words about beer. But when I stop my talking, My mind turns straight to wine. I hate bugs and errors. But I just think back to wine, And I'm happy once again. I like to hang out with programming and deep learning. But when left alone, My mind turns straight to wine.
char_vocab = sorted(list(set(text))) # 중복을 제거한 글자 집합 생성
print(char_vocab)
vocab_size=len(char_vocab)
print ('글자 집합의 크기 : {}'.format(vocab_size))
[' ', "'", ',', '.', '?', 'A', 'B', 'D', 'I', 'M', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y'] 글자 집합의 크기 : 33
char_to_index = dict((c, i) for i, c in enumerate(char_vocab)) # 글자에 고유한 정수 인덱스 부여
print(char_to_index)
{' ': 0, "'": 1, ',': 2, '.': 3, '?': 4, 'A': 5, 'B': 6, 'D': 7, 'I': 8, 'M': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'r': 26, 's': 27, 't': 28, 'u': 29, 'v': 30, 'w': 31, 'y': 32}
# Example) predict the next character from a sequence of 5 input characters, i.e. the RNN runs for 5 time steps:
# stude -> n
# tuden -> t
length = 11
sequences = []
for i in range(length, len(text)):
    seq = text[i-length:i] # keep producing substrings of length 11
    sequences.append(seq)
print('총 훈련 샘플의 수: %d' % len(sequences))
총 훈련 샘플의 수: 426
print(sequences[:10])
print(sequences[30:45])
['I get on wi', ' get on wit', 'get on with', 'et on with ', 't on with l', ' on with li', 'on with lif', 'n with life', ' with life ', 'with life a'] ['mmer, I lik', 'mer, I like', 'er, I like ', 'r, I like t', ', I like to', ' I like to ', 'I like to c', ' like to co', 'like to con', 'ike to cont', 'ke to conte', 'e to contem', ' to contemp', 'to contempl', 'o contempla']
X = []
for line in sequences: # take the samples one at a time from the full data
    temp_X = [char_to_index[char] for char in line] # integer-encode each character of the sample
    X.append(temp_X)
for line in X[:5]:
    print(line)
[8, 0, 16, 14, 28, 0, 24, 23, 0, 31, 18] [0, 16, 14, 28, 0, 24, 23, 0, 31, 18, 28] [16, 14, 28, 0, 24, 23, 0, 31, 18, 28, 17] [14, 28, 0, 24, 23, 0, 31, 18, 28, 17, 0] [28, 0, 24, 23, 0, 31, 18, 28, 17, 0, 21]
# Separate the label (the last character of each sequence)
sequences = np.array(X)
X = sequences[:,:-1]
y = sequences[:,-1] # the character in the last position becomes the label
for line in X[:5]:
    print(line)
[ 8 0 16 14 28 0 24 23 0 31] [ 0 16 14 28 0 24 23 0 31 18] [16 14 28 0 24 23 0 31 18 28] [14 28 0 24 23 0 31 18 28 17] [28 0 24 23 0 31 18 28 17 0]
# 원핫 인코딩
sequences = [to_categorical(x, num_classes=vocab_size) for x in X] # X에 대한 원-핫 인코딩
X = np.array(sequences)
y = to_categorical(y, num_classes=vocab_size) # y에 대한 원-핫 인코딩
print(X.shape)
(426, 10, 33)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
model = Sequential()
model.add(LSTM(80, input_shape=(X.shape[1], X.shape[2]))) # X.shape[1] is 10, X.shape[2] is 33
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=100, verbose=2)
Epoch 1/100 14/14 - 0s - loss: 3.4580 - accuracy: 0.1432 Epoch 2/100 14/14 - 0s - loss: 3.2984 - accuracy: 0.1972 Epoch 3/100 14/14 - 0s - loss: 3.0382 - accuracy: 0.1972 Epoch 4/100 14/14 - 0s - loss: 2.9666 - accuracy: 0.1972 Epoch 5/100 14/14 - 0s - loss: 2.9433 - accuracy: 0.1972 Epoch 6/100 14/14 - 0s - loss: 2.9227 - accuracy: 0.1972 Epoch 7/100 14/14 - 0s - loss: 2.9052 - accuracy: 0.1972 Epoch 8/100 14/14 - 0s - loss: 2.8916 - accuracy: 0.1972 Epoch 9/100 14/14 - 0s - loss: 2.8678 - accuracy: 0.1995 Epoch 10/100 14/14 - 0s - loss: 2.8377 - accuracy: 0.1972 Epoch 11/100 14/14 - 0s - loss: 2.8018 - accuracy: 0.2089 Epoch 12/100 14/14 - 0s - loss: 2.7741 - accuracy: 0.2230 Epoch 13/100 14/14 - 0s - loss: 2.7372 - accuracy: 0.2089 Epoch 14/100 14/14 - 0s - loss: 2.6775 - accuracy: 0.2418 Epoch 15/100 14/14 - 0s - loss: 2.6518 - accuracy: 0.2512 Epoch 16/100 14/14 - 0s - loss: 2.5878 - accuracy: 0.2488 Epoch 17/100 14/14 - 0s - loss: 2.5182 - accuracy: 0.2723 Epoch 18/100 14/14 - 0s - loss: 2.4703 - accuracy: 0.2981 Epoch 19/100 14/14 - 0s - loss: 2.4139 - accuracy: 0.2981 Epoch 20/100 14/14 - 0s - loss: 2.3649 - accuracy: 0.3099 Epoch 21/100 14/14 - 0s - loss: 2.3031 - accuracy: 0.3404 Epoch 22/100 14/14 - 0s - loss: 2.2510 - accuracy: 0.3451 Epoch 23/100 14/14 - 0s - loss: 2.1971 - accuracy: 0.3709 Epoch 24/100 14/14 - 0s - loss: 2.1257 - accuracy: 0.3826 Epoch 25/100 14/14 - 0s - loss: 2.0661 - accuracy: 0.3850 Epoch 26/100 14/14 - 0s - loss: 2.0337 - accuracy: 0.4085 Epoch 27/100 14/14 - 0s - loss: 1.9839 - accuracy: 0.4554 Epoch 28/100 14/14 - 0s - loss: 1.9434 - accuracy: 0.4296 Epoch 29/100 14/14 - 0s - loss: 1.9031 - accuracy: 0.4366 Epoch 30/100 14/14 - 0s - loss: 1.8332 - accuracy: 0.5000 Epoch 31/100 14/14 - 0s - loss: 1.7705 - accuracy: 0.4977 Epoch 32/100 14/14 - 0s - loss: 1.7159 - accuracy: 0.5164 Epoch 33/100 14/14 - 0s - loss: 1.6726 - accuracy: 0.5423 Epoch 34/100 14/14 - 0s - loss: 1.6274 - accuracy: 0.5540 Epoch 35/100 14/14 - 0s - loss: 1.5915 - accuracy: 0.5986 Epoch 36/100 14/14 - 0s - loss: 1.5312 - accuracy: 0.5986 Epoch 37/100 14/14 - 0s - loss: 1.5075 - accuracy: 0.5986 Epoch 38/100 14/14 - 0s - loss: 1.4553 - accuracy: 0.6268 Epoch 39/100 14/14 - 0s - loss: 1.3836 - accuracy: 0.6455 Epoch 40/100 14/14 - 0s - loss: 1.3247 - accuracy: 0.6901 Epoch 41/100 14/14 - 0s - loss: 1.2974 - accuracy: 0.6878 Epoch 42/100 14/14 - 0s - loss: 1.2430 - accuracy: 0.7183 Epoch 43/100 14/14 - 0s - loss: 1.2240 - accuracy: 0.7300 Epoch 44/100 14/14 - 0s - loss: 1.1928 - accuracy: 0.7277 Epoch 45/100 14/14 - 0s - loss: 1.1447 - accuracy: 0.7347 Epoch 46/100 14/14 - 0s - loss: 1.0965 - accuracy: 0.7582 Epoch 47/100 14/14 - 0s - loss: 1.0653 - accuracy: 0.7559 Epoch 48/100 14/14 - 0s - loss: 1.0373 - accuracy: 0.7512 Epoch 49/100 14/14 - 0s - loss: 1.0040 - accuracy: 0.7770 Epoch 50/100 14/14 - 0s - loss: 0.9708 - accuracy: 0.7793 Epoch 51/100 14/14 - 0s - loss: 0.9332 - accuracy: 0.7700 Epoch 52/100 14/14 - 0s - loss: 0.8950 - accuracy: 0.8052 Epoch 53/100 14/14 - 0s - loss: 0.8532 - accuracy: 0.8122 Epoch 54/100 14/14 - 0s - loss: 0.8142 - accuracy: 0.8263 Epoch 55/100 14/14 - 0s - loss: 0.7963 - accuracy: 0.8592 Epoch 56/100 14/14 - 0s - loss: 0.7762 - accuracy: 0.8310 Epoch 57/100 14/14 - 0s - loss: 0.7459 - accuracy: 0.8568 Epoch 58/100 14/14 - 0s - loss: 0.7013 - accuracy: 0.8638 Epoch 59/100 14/14 - 0s - loss: 0.6754 - accuracy: 0.8803 Epoch 60/100 14/14 - 0s - loss: 0.6576 - accuracy: 0.8779 Epoch 61/100 14/14 - 0s - loss: 0.6340 - accuracy: 0.9014 Epoch 62/100 14/14 - 0s - 
loss: 0.6083 - accuracy: 0.9038 Epoch 63/100 14/14 - 0s - loss: 0.5740 - accuracy: 0.9108 Epoch 64/100 14/14 - 0s - loss: 0.5576 - accuracy: 0.9178 Epoch 65/100 14/14 - 0s - loss: 0.5215 - accuracy: 0.9319 Epoch 66/100 14/14 - 0s - loss: 0.5031 - accuracy: 0.9319 Epoch 67/100 14/14 - 0s - loss: 0.4847 - accuracy: 0.9437 Epoch 68/100 14/14 - 0s - loss: 0.4747 - accuracy: 0.9390 Epoch 69/100 14/14 - 0s - loss: 0.4512 - accuracy: 0.9413 Epoch 70/100 14/14 - 0s - loss: 0.4289 - accuracy: 0.9554 Epoch 71/100 14/14 - 0s - loss: 0.4233 - accuracy: 0.9531 Epoch 72/100 14/14 - 0s - loss: 0.4173 - accuracy: 0.9531 Epoch 73/100 14/14 - 0s - loss: 0.4016 - accuracy: 0.9531 Epoch 74/100 14/14 - 0s - loss: 0.3682 - accuracy: 0.9601 Epoch 75/100 14/14 - 0s - loss: 0.3535 - accuracy: 0.9671 Epoch 76/100 14/14 - 0s - loss: 0.3401 - accuracy: 0.9742 Epoch 77/100 14/14 - 0s - loss: 0.3206 - accuracy: 0.9765 Epoch 78/100 14/14 - 0s - loss: 0.3207 - accuracy: 0.9671 Epoch 79/100 14/14 - 0s - loss: 0.3217 - accuracy: 0.9624 Epoch 80/100 14/14 - 0s - loss: 0.3150 - accuracy: 0.9671 Epoch 81/100 14/14 - 0s - loss: 0.2963 - accuracy: 0.9718 Epoch 82/100 14/14 - 0s - loss: 0.2772 - accuracy: 0.9695 Epoch 83/100 14/14 - 0s - loss: 0.2687 - accuracy: 0.9859 Epoch 84/100 14/14 - 0s - loss: 0.2537 - accuracy: 0.9765 Epoch 85/100 14/14 - 0s - loss: 0.2457 - accuracy: 0.9789 Epoch 86/100 14/14 - 0s - loss: 0.2300 - accuracy: 0.9836 Epoch 87/100 14/14 - 0s - loss: 0.2311 - accuracy: 0.9836 Epoch 88/100 14/14 - 0s - loss: 0.2207 - accuracy: 0.9742 Epoch 89/100 14/14 - 0s - loss: 0.2143 - accuracy: 0.9789 Epoch 90/100 14/14 - 0s - loss: 0.2068 - accuracy: 0.9836 Epoch 91/100 14/14 - 0s - loss: 0.2104 - accuracy: 0.9789 Epoch 92/100 14/14 - 0s - loss: 0.1982 - accuracy: 0.9812 Epoch 93/100 14/14 - 0s - loss: 0.1929 - accuracy: 0.9812 Epoch 94/100 14/14 - 0s - loss: 0.1829 - accuracy: 0.9836 Epoch 95/100 14/14 - 0s - loss: 0.1757 - accuracy: 0.9789 Epoch 96/100 14/14 - 0s - loss: 0.1697 - accuracy: 0.9789 Epoch 97/100 14/14 - 0s - loss: 0.1673 - accuracy: 0.9836 Epoch 98/100 14/14 - 0s - loss: 0.1645 - accuracy: 0.9789 Epoch 99/100 14/14 - 0s - loss: 0.1697 - accuracy: 0.9742 Epoch 100/100 14/14 - 0s - loss: 0.1615 - accuracy: 0.9812
<tensorflow.python.keras.callbacks.History at 0x7ff41dc2df60>
def sentence_generation(model, char_to_index, seq_length, seed_text, n):
    # model, index mapping, sequence length, initial sequence, number of characters to generate
    init_text = seed_text # initial sequence used to start generation
    sentence = ''
    for _ in range(n): # repeat n times
        encoded = [char_to_index[char] for char in seed_text] # integer-encode the current sequence
        encoded = pad_sequences([encoded], maxlen=seq_length, padding='pre') # pad the input
        encoded = to_categorical(encoded, num_classes=len(char_to_index))
        result = model.predict_classes(encoded, verbose=0)
        # predict y (the next character) for the input X (the current sequence) and store it in result
        for char, index in char_to_index.items(): # if a character's index matches the predicted index
            if index == result: # that character is the predicted character, so stop searching
                break
        seed_text = seed_text + char # current sequence + predicted character becomes the new sequence
        sentence = sentence + char # append the predicted character to the generated sentence
        # the loop then repeats this procedure
    sentence = init_text + sentence
    return sentence
print(sentence_generation(model, char_to_index, 10, 'I get on w', 80))
I get on with life as a programmer, I like to use words about beer. But when I stap mto da
One-hot vectors are sparse vectors.
Problem: as the number of words grows, the dimensionality of the vectors grows without bound.
An embedding (dense) vector instead holds real values rather than only 0s and 1s.
ex) 강아지 = [0 0 0 0 0 1 0 0 0 ...] (one-hot vector)
-> 강아지 = [0.2 1.8 1.1 -2.1 1.1 ...]
Because the dimensions are now densely filled with values, such a vector is called a dense vector (illustrated in the sketch below).
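As a minimal illustration (my own toy example, not from the original notes, with made-up numbers), the sketch below shows that multiplying a one-hot vector by an embedding matrix simply selects one row of that matrix, and that row is the dense vector described above.
import numpy as np

vocab_size, embedding_dim = 9, 5                 # toy sizes, chosen only for illustration
E = np.random.randn(vocab_size, embedding_dim)   # embedding matrix (lookup table)

one_hot = np.zeros(vocab_size)
one_hot[5] = 1                                   # e.g. the one-hot vector for '강아지'
dense = one_hot @ E                              # multiplying by a one-hot vector...
print(np.allclose(dense, E[5]))                  # ...is just a row lookup: True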
ex) "The fat cat sat on the mat"
중심단어를 예측하기 위해 앞,뒤로 몇 개의 단어를 볼지 결정했다면, 이 범위를 윈도우라고 한다. 윈도우 크기가 2이고, 중심단어가 sat이라고 한다면, 앞의 두 단어인 fat, cat 뒤의 두 단어인 on, the를 참고함.
윈도우 크기를 정한 이후에는 윈도우를 계속 움직여 주변단어와 중심단어 선택을 바꿔가며 학습데이터셋을 만들 수 있음. 이 방법을 슬라이딩 윈도우라고 함.
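The following is a small sketch (my own illustration, not part of the original notes) of how a window of size 2 slides over "The fat cat sat on the mat" and produces (center word, context word) training pairs; the variable names are only for illustration.
sentence = "the fat cat sat on the mat".split()
window_size = 2

pairs = []
for i, center in enumerate(sentence):
    # context words are the window_size words on each side of the center word
    for j in range(max(0, i - window_size), min(len(sentence), i + window_size + 1)):
        if j != i:
            pairs.append((center, sentence[j]))

print(pairs[:6])
# [('the', 'fat'), ('the', 'cat'), ('fat', 'the'), ('fat', 'cat'), ('fat', 'sat'), ('cat', 'the')]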
The inputs to Word2Vec must all be one-hot vectors.
Word2Vec is not a deep learning model (a deep model stacks a sufficient number of hidden layers between the input and output layers; Word2Vec has only a single hidden layer with no activation function).
Skip-gram predicts the surrounding words from the word in the middle.
It is generally known to perform better than CBOW.
However, when the vocabulary becomes very large, training becomes a heavy operation.
Word2Vec computes a softmax over the entire vocabulary and backpropagates through it, so the embeddings of all words are adjusted, including words unrelated to the current context (see the sketch below).
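To make the last point concrete, here is a rough sketch (my own illustration with arbitrary dimensions, not the library's implementation) of the skip-gram softmax: scoring one context word requires a dot product with every word in the vocabulary, so each update involves a computation that grows with the vocabulary size, which is what negative sampling later avoids.
import numpy as np

V, d = 20000, 100                        # vocabulary size and embedding dimension (arbitrary)
W_in  = np.random.randn(V, d) * 0.01     # center-word embeddings (the single hidden layer)
W_out = np.random.randn(V, d) * 0.01     # context-word (output) embeddings

center = 42                              # index of some center word
scores = W_out @ W_in[center]            # one score per vocabulary word: shape (V,)
probs  = np.exp(scores - scores.max())   # softmax over the *entire* vocabulary
probs /= probs.sum()
print(probs.shape)                       # (20000,) -- the cost grows with V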
import re
from lxml import etree
import urllib.request
import zipfile
from nltk.tokenize import word_tokenize, sent_tokenize
urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")
# Download the data
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    target_text = etree.parse(z.open('ted_en-20160408.xml', 'r'))
    parse_text = '\n'.join(target_text.xpath('//content/text()'))
    # extract only the text between <content> and </content> from the xml file
parse_text[:300]
"Here are two reasons companies fail: they only do more of the same, or they only do what's new.\nTo me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but it can be too much of a good thing.\nConsider Facit"
import nltk
nltk.download('punkt')
content_text = re.sub(r'\([^)]*\)', '', parse_text)
# Use re.sub to remove background annotations such as (Audio) and (Laughter) that appear inside the content.
# This line removes anything wrapped in parentheses.
sent_text = sent_tokenize(content_text)
# Sentence-tokenize the input corpus with NLTK.
normalized_text = []
for string in sent_text:
    tokens = re.sub(r"[^a-z0-9]+", " ", string.lower())
    normalized_text.append(tokens)
# For each sentence, remove punctuation and lowercase the text.
result = [word_tokenize(sentence) for sentence in normalized_text]
# Word-tokenize each sentence with NLTK.
print('총 샘플의 개수: {}'.format(len(result)))
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip. 총 샘플의 개수: 273424
for line in result[:3]: # print only 3 samples
    print(line)
['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'] ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation'] ['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing']
from gensim.models import Word2Vec
model = Word2Vec(sentences=result, size=100, window=5, min_count=5, workers=4, sg=0)
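# (note, added annotation) size is the embedding dimension, window the number of context words on each side,
# min_count drops words appearing fewer than 5 times, workers the number of threads,
# and sg=0 selects CBOW while sg=1 would select Skip-gram.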
model_result = model.wv.most_similar("man")
print(model_result)
[('woman', 0.8496309518814087), ('guy', 0.8047130107879639), ('lady', 0.7630440592765808), ('boy', 0.7434742450714111), ('girl', 0.7397940158843994), ('poet', 0.7102178335189819), ('gentleman', 0.6967080235481262), ('kid', 0.6860352754592896), ('soldier', 0.6833100318908691), ('surgeon', 0.6762808561325073)]
/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`. if np.issubdtype(vec.dtype, np.int):
from gensim.models import KeyedVectors
model.wv.save_word2vec_format('eng_w2v') # 모델 저장
loaded_model = KeyedVectors.load_word2vec_format("eng_w2v") # 모델 로드
/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:254: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
model_result = loaded_model.most_similar("man")
print(model_result)
[('woman', 0.8496309518814087), ('guy', 0.8047130107879639), ('lady', 0.7630440592765808), ('boy', 0.7434742450714111), ('girl', 0.7397940158843994), ('poet', 0.7102178335189819), ('gentleman', 0.6967080235481262), ('kid', 0.6860352754592896), ('soldier', 0.6833100318908691), ('surgeon', 0.6762808561325073)]
/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`. if np.issubdtype(vec.dtype, np.int):
!pip install konlpy
Collecting konlpy
Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
|████████████████████████████████| 19.4MB 1.9MB/s
Collecting beautifulsoup4==4.6.0
Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
|████████████████████████████████| 92kB 8.9MB/s
Collecting JPype1>=0.7.0
Downloading https://files.pythonhosted.org/packages/8b/f7/a368401e630f0e390dd0e62c39fb928e5b23741b53c2360ee7d376660927/JPype1-1.0.2-cp36-cp36m-manylinux2010_x86_64.whl (3.8MB)
|████████████████████████████████| 3.8MB 45.3MB/s
Requirement already satisfied: lxml>=4.1.0 in /usr/local/lib/python3.6/dist-packages (from konlpy) (4.2.6)
Collecting tweepy>=3.7.0
Downloading https://files.pythonhosted.org/packages/bb/7c/99d51f80f3b77b107ebae2634108717362c059a41384a1810d13e2429a81/tweepy-3.9.0-py2.py3-none-any.whl
Requirement already satisfied: numpy>=1.6 in /usr/local/lib/python3.6/dist-packages (from konlpy) (1.18.5)
Collecting colorama
Downloading https://files.pythonhosted.org/packages/c9/dc/45cdef1b4d119eb96316b3117e6d5708a08029992b2fee2c143c7a0a5cc5/colorama-0.4.3-py2.py3-none-any.whl
Requirement already satisfied: typing-extensions; python_version < "3.8" in /usr/local/lib/python3.6/dist-packages (from JPype1>=0.7.0->konlpy) (3.7.4.2)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (1.15.0)
Requirement already satisfied: requests[socks]>=2.11.1 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (2.23.0)
Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (1.3.0)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2.10)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.24.3)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2020.6.20)
Requirement already satisfied: PySocks!=1.5.7,>=1.5.6; extra == "socks" in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.7.1)
Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->tweepy>=3.7.0->konlpy) (3.1.0)
Installing collected packages: beautifulsoup4, JPype1, tweepy, colorama, konlpy
Found existing installation: beautifulsoup4 4.6.3
Uninstalling beautifulsoup4-4.6.3:
Successfully uninstalled beautifulsoup4-4.6.3
Found existing installation: tweepy 3.6.0
Uninstalling tweepy-3.6.0:
Successfully uninstalled tweepy-3.6.0
Successfully installed JPype1-1.0.2 beautifulsoup4-4.6.0 colorama-0.4.3 konlpy-0.5.2 tweepy-3.9.0
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
from gensim.models.word2vec import Word2Vec
from konlpy.tag import Okt
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")
train_data = pd.read_table('ratings.txt')
train_data[:5]
| | id | document | label |
|---|---|---|---|
| 0 | 8112052 | 어릴때보고 지금다시봐도 재밌어요ㅋㅋ | 1 |
| 1 | 8132799 | 디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산... | 1 |
| 2 | 4655635 | 폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고. | 1 |
| 3 | 9251303 | 와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런... | 1 |
| 4 | 10067386 | 안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화. | 1 |
print(len(train_data)) # 리뷰 개수 출력
# NULL 값 존재 유무
print(train_data.isnull().values.any())
train_data = train_data.dropna(how = 'any') # Null 값이 존재하는 행 제거
print(train_data.isnull().values.any()) # Null 값이 존재하는지 확인
print(len(train_data)) # 리뷰 개수 출력
# 정규 표현식을 통한 한글 외 문자 제거
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
train_data[:5] # 상위 5개 출력
200000 True False 199992
| | id | document | label |
|---|---|---|---|
| 0 | 8112052 | 어릴때보고 지금다시봐도 재밌어요ㅋㅋ | 1 |
| 1 | 8132799 | 디자인을 배우는 학생으로 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산업... | 1 |
| 2 | 4655635 | 폴리스스토리 시리즈는 부터 뉴까지 버릴께 하나도 없음 최고 | 1 |
| 3 | 9251303 | 와 연기가 진짜 개쩔구나 지루할거라고 생각했는데 몰입해서 봤다 그래 이런게 진짜 영화지 | 1 |
| 4 | 10067386 | 안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화 | 1 |
# Define stopwords
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
# Tokenization with the Okt morphological analyzer (this takes a while)
okt = Okt()
tokenized_data = []
for sentence in train_data['document']:
    temp_X = okt.morphs(sentence, stem=True) # tokenize
    temp_X = [word for word in temp_X if not word in stopwords] # remove stopwords
    tokenized_data.append(temp_X)
# 리뷰 길이 분포 확인
print('리뷰의 최대 길이 :',max(len(l) for l in tokenized_data))
print('리뷰의 평균 길이 :',sum(map(len, tokenized_data))/len(tokenized_data))
plt.hist([len(s) for s in tokenized_data], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()
from gensim.models import Word2Vec
model = Word2Vec(sentences = tokenized_data, size = 100, window = 5, min_count = 5, workers = 4, sg = 0)
# 완성된 임베딩 매트릭스의 크기 확인
model.wv.vectors.shape
print(model.wv.most_similar("최민식"))
print(model.wv.most_similar("히어로"))
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
print('총 샘플 수 :',len(documents))
Downloading 20news dataset. This may take a few minutes. Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
총 샘플 수 : 11314
news_df = pd.DataFrame({'document':documents})
# 특수 문자 제거
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")
# 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# 전체 단어에 대한 소문자 변환
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())
# null value 확인
news_df.isnull().values.any()
False
# empty value 확인
news_df.replace("", float("NaN"), inplace=True)
news_df.isnull().values.any()
True
# null값 제거
news_df.dropna(inplace=True)
print('총 샘플 수 :',len(news_df))
총 샘플 수 : 10995
import nltk
nltk.download('stopwords')
# 불용어를 제거
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
tokenized_doc = tokenized_doc.to_list()
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
# 단어가 1개 이하인 샘플의 인덱스를 찾아서 저장하고, 해당 샘플들은 제거.
drop_train = [index for index, sentence in enumerate(tokenized_doc) if len(sentence) <= 1]
tokenized_doc = np.delete(tokenized_doc, drop_train, axis=0)
print('총 샘플 수 :',len(tokenized_doc))
총 샘플 수 : 10940
# 정수인코딩
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)
word2idx = tokenizer.word_index
idx2word = {v:k for k, v in word2idx.items()}
encoded = tokenizer.texts_to_sequences(tokenized_doc)
print(encoded[:2])
[[9, 59, 603, 207, 3278, 1495, 474, 702, 9470, 13686, 5533, 15227, 702, 442, 702, 70, 1148, 1095, 1036, 20294, 984, 705, 4294, 702, 217, 207, 1979, 15228, 13686, 4865, 4520, 87, 1530, 6, 52, 149, 581, 661, 4406, 4988, 4866, 1920, 755, 10668, 1102, 7837, 442, 957, 10669, 634, 51, 228, 2669, 4989, 178, 66, 222, 4521, 6066, 68, 4295], [1026, 532, 2, 60, 98, 582, 107, 800, 23, 79, 4522, 333, 7838, 864, 421, 3825, 458, 6488, 458, 2700, 4730, 333, 23, 9, 4731, 7262, 186, 310, 146, 170, 642, 1260, 107, 33568, 13, 985, 33569, 33570, 9471, 11491]]
vocab_size = len(word2idx) + 1
print('단어 집합의 크기 :', vocab_size)
단어 집합의 크기 : 64277
from tensorflow.keras.preprocessing.sequence import skipgrams
# 네거티브 샘플링
skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in encoded[:10]]
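# (note, added annotation) skipgrams returns, for each sample, a tuple (pairs, labels): pairs is a list of
# (target word, context word) index pairs, and labels marks a pair with 1 if the context word really
# appeared within the window and with 0 if it was drawn at random (a negative sample).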
# 첫번째 샘플인 skip_grams[0] 내 skipgrams로 형성된 데이터셋 확인
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(5):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
          idx2word[pairs[i][0]], pairs[i][0],
          idx2word[pairs[i][1]], pairs[i][1],
          labels[i]))
(least (87), wefl (54068)) -> 0 (government (51), look (66)) -> 1 (degree (1530), magaziner (39612)) -> 0 (subsidizing (15228), least (87)) -> 1 (europeans (4520), subsidizing (15228)) -> 1
print('전체 샘플 수 :',len(skip_grams))
전체 샘플 수 : 10
# 첫번째 뉴스그룹 샘플에 대해서 생긴 pairs와 labels의 개수
print(len(pairs))
print(len(labels))
2220 2220
skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in encoded]
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input
from tensorflow.keras.layers import Dot
from tensorflow.keras.utils import plot_model
from IPython.display import SVG
embed_size = 100
# 모델 설계
# 중심 단어를 위한 임베딩 테이블
w_inputs = Input(shape=(1, ), dtype='int32')
word_embedding = Embedding(vocab_size, embed_size)(w_inputs)
# 주변 단어를 위한 임베딩 테이블
c_inputs = Input(shape=(1, ), dtype='int32')
context_embedding = Embedding(vocab_size, embed_size)(c_inputs)
# dot product of the two embedding vectors, then a sigmoid activation gives the predicted label
dot_product = Dot(axes=2)([word_embedding, context_embedding])
dot_product = Reshape((1,), input_shape=(1, 1))(dot_product)
output = Activation('sigmoid')(dot_product)
model = Model(inputs=[w_inputs, c_inputs], outputs=output)
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam')
plot_model(model, to_file='model3.png', show_shapes=True, show_layer_names=True, rankdir='TB')
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
input_2 (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
embedding (Embedding) (None, 1, 100) 6427700 input_1[0][0]
__________________________________________________________________________________________________
embedding_1 (Embedding) (None, 1, 100) 6427700 input_2[0][0]
__________________________________________________________________________________________________
dot (Dot) (None, 1, 1) 0 embedding[0][0]
embedding_1[0][0]
__________________________________________________________________________________________________
reshape (Reshape) (None, 1) 0 dot[0][0]
__________________________________________________________________________________________________
activation (Activation) (None, 1) 0 reshape[0][0]
==================================================================================================
Total params: 12,855,400
Trainable params: 12,855,400
Non-trainable params: 0
__________________________________________________________________________________________________
for epoch in range(1, 6):
    loss = 0
    for _, elem in enumerate(skip_grams):
        first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X, Y)
    print('Epoch :', epoch, 'Loss :', loss)
import gensim
f = open('vectors.txt', 'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)
w2v.most_similar(positive=['soldiers'])
w2v.most_similar(positive=['doctor'])
w2v.most_similar(positive=['police'])
w2v.most_similar(positive=['knife'])
w2v.most_similar(positive=['engine'])
LSA is a method that takes global corpus statistics as input -- a matrix of per-document word frequencies such as a DTM or TF-IDF matrix -- and reduces its dimensionality (truncated SVD) to draw out latent meaning. Word2Vec, by contrast, is a prediction-based method that learns by using a loss function to shrink the error between predicted and actual values.
LSA is count-based and does account for the corpus's global statistics, but it performs poorly on word analogy tasks such as king:man = queen:? (answer: woman). Word2Vec is prediction-based and outperforms LSA on analogies, but because each embedding only considers neighboring words within the window size, it fails to reflect the corpus's global statistics.
GloVe points out these respective limitations of the earlier methods and uses both mechanisms: the count-based approach of LSA and the prediction-based approach of Word2Vec.
P(k | i) is a conditional probability computed by counting the total occurrences of a word i and, among those, how often a word k appears alongside it.
With i as the center word and k as the context word in P(k | i), this is the value in row i, column k of the co-occurrence matrix introduced above, divided by the sum of all values in row i.
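A minimal sketch of this co-occurrence probability, computed from a toy corpus; the corpus and window size below are illustrative assumptions, not part of GloVe itself.
import numpy as np
corpus = [['ice', 'is', 'solid'], ['steam', 'is', 'gas']]
vocab = sorted({w for sent in corpus for w in sent})
idx = {w: j for j, w in enumerate(vocab)}
# Build the co-occurrence matrix X with a window size of 1.
X = np.zeros((len(vocab), len(vocab)))
window = 1
for sent in corpus:
    for pos, center in enumerate(sent):
        for k in range(max(0, pos - window), min(len(sent), pos + window + 1)):
            if k != pos:
                X[idx[center], idx[sent[k]]] += 1
# P(k | i) = X[i][k] / (sum of row i), exactly as defined above.
def cooccur_prob(i_word, k_word):
    return X[idx[i_word], idx[k_word]] / X[idx[i_word]].sum()
print(cooccur_prob('is', 'solid')) # 0.25: 'solid' accounts for 1 of the 4 co-occurrences of 'is'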
To be used as input to an embedding layer, every word in the input sequence must first be integer-encoded.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
sentences = ['nice great best amazing', 'stop lies', 'pitiful nerd', 'excellent work', 'supreme quality', 'bad', 'highly respectable']
y_train = [1, 0, 0, 1, 1, 0, 1]
t = Tokenizer()
t.fit_on_texts(sentences)
vocab_size = len(t.word_index) + 1
print(vocab_size)
X_encoded = t.texts_to_sequences(sentences)
print(X_encoded)
16
[[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]
# Padding: pad every sequence to the maximum length
max_len=max(len(l) for l in X_encoded)
print(max_len)
X_train=pad_sequences(X_encoded, maxlen=max_len, padding='post')
y_train=np.array(y_train)
print(X_train)
4
[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]
# Model design
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
model = Sequential()
model.add(Embedding(vocab_size, 4, input_length=max_len)) # every embedding vector is 4-dimensional
model.add(Flatten()) # flatten the embeddings so they can feed the Dense layer
model.add(Dense(1, activation='sigmoid')) # sigmoid activation for binary classification
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)
Epoch 1/100 1/1 - 0s - loss: 0.6994 - acc: 0.4286
Epoch 2/100 1/1 - 0s - loss: 0.6979 - acc: 0.4286
...
Epoch 19/100 1/1 - 0s - loss: 0.6740 - acc: 1.0000
...
Epoch 100/100 1/1 - 0s - loss: 0.5331 - acc: 1.0000
<tensorflow.python.keras.callbacks.History at 0x7fad4b4b2d68>
print(X_train)
print(y_train)
[[ 1 2 3 4] [ 5 6 0 0] [ 7 8 0 0] [ 9 10 0 0] [11 12 0 0] [13 0 0 0] [14 15 0 0]] [1 0 0 1 1 0 1]
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
--2020-08-18 23:46:00-- http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’
2020-08-18 23:52:30 (2.11 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]
Archive: glove.6B.zip
  inflating: glove.6B.50d.txt
  inflating: glove.6B.100d.txt
  inflating: glove.6B.200d.txt
  inflating: glove.6B.300d.txt
n = 0
f = open('glove.6B.100d.txt', encoding="utf8")
for line in f:
    word_vector = line.split() # read each line into word_vector
    print(word_vector)         # print the whole line
    word = word_vector[0]      # keep only the first value: the word itself
    print(word)                # print just the word
    n = n + 1
    if n == 2:
        break
f.close()
print(type(word_vector))
print(len(word_vector))
['the', '-0.038194', '-0.24487', '0.72812', '-0.39961', ..., '-0.1459', '0.8278', '0.27062']
the
[',', '-0.10767', '0.11053', '0.59812', '-0.54361', ..., '-0.83155', '0.45293', '0.082577']
,
<class 'list'>
101
import numpy as np
embedding_dict = dict()
f = open('glove.6B.100d.txt', encoding="utf8")
for line in f:
    word_vector = line.split()
    word = word_vector[0]
    word_vector_arr = np.asarray(word_vector[1:], dtype='float32') # convert the 100 values into an array
    embedding_dict[word] = word_vector_arr
f.close()
print('%s embedding vectors loaded.' % len(embedding_dict))
print(embedding_dict['respectable'])
print(len(embedding_dict['respectable']))
embedding_matrix = np.zeros((vocab_size, 100))
# Create a matrix with vocab_size rows and 100 columns, filled entirely with zeros.
np.shape(embedding_matrix)
print(t.word_index.items())
for word, i in t.word_index.items(): # take each word from the training vocabulary
    temp = embedding_dict.get(word)  # the 100 GloVe values (the dict value) for the word, or None
    if temp is not None:
        embedding_matrix[i] = temp   # insert the vector into the row mapped to the word's index
400000 embedding vectors loaded.
[-0.049773 0.19903 0.10585 0.1391 -0.32395 0.44053
0.3947 -0.22805 -0.25793 0.49768 0.15384 -0.08831
0.0782 -0.8299 -0.037788 0.16772 -0.45197 -0.17085
0.74756 0.98256 0.81872 0.28507 0.16178 -0.48626
-0.006265 -0.92469 -0.30625 -0.067318 -0.046762 -0.76291
-0.0025264 -0.018795 0.12882 -0.52457 0.3586 0.43119
-0.89477 -0.057421 -0.53724 0.25587 0.55195 0.44698
-0.24252 0.29946 0.25776 -0.8717 0.68426 -0.05688
-0.1848 -0.59352 -0.11227 -0.57692 -0.013593 0.18488
-0.32507 -0.90171 0.17672 0.075601 0.54896 -0.21488
-0.54018 -0.45882 -0.79536 0.26331 0.18879 -0.16363
0.3975 0.1099 0.1164 -0.083499 0.50159 0.35802
0.25677 0.088546 0.42108 0.28674 -0.71285 -0.82915
0.15297 -0.82712 0.022112 1.067 -0.31776 0.1211
-0.069755 -0.61327 0.27308 -0.42638 -0.085084 -0.17694
-0.0090944 0.1109 0.62543 -0.23682 -0.44928 -0.3667
-0.21616 -0.19187 -0.032502 0.38025 ]
100
dict_items([('nice', 1), ('great', 2), ('best', 3), ('amazing', 4), ('stop', 5), ('lies', 6), ('pitiful', 7), ('nerd', 8), ('excellent', 9), ('work', 10), ('supreme', 11), ('quality', 12), ('bad', 13), ('highly', 14), ('respectable', 15)])
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)
Epoch 1/100 1/1 - 0s - loss: 0.6723 - acc: 0.4286
Epoch 2/100 1/1 - 0s - loss: 0.6529 - acc: 0.5714
...
Epoch 13/100 1/1 - 0s - loss: 0.4847 - acc: 1.0000
...
Epoch 100/100 1/1 - 0s - loss: 0.0984 - acc: 1.0000
<tensorflow.python.keras.callbacks.History at 0x7fad45137be0>
import numpy as np
import gensim
# Download Google's pre-trained Word2Vec model to the current directory
!wget "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# Load Google's pre-trained Word2Vec model
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
print(word2vec_model.vectors.shape) # check the model's size
embedding_matrix = np.zeros((vocab_size, 300))
# Create a matrix with vocab_size rows and 300 columns, filled entirely with zeros.
np.shape(embedding_matrix)
def get_vector(word):
    if word in word2vec_model:
        return word2vec_model[word]
    else:
        return None
for word, i in t.word_index.items(): # take each word and its integer index from the training vocabulary
    temp = get_vector(word)          # the 300 Word2Vec values for the word, or None
    if temp is not None:             # a vector was returned for this word
        embedding_matrix[i] = temp   # store it in the row mapped to the word's index
print(word2vec_model['nice'])
print('integer index of the word nice :', t.word_index['nice'])
print(embedding_matrix[1])
--2020-08-18 23:53:25-- https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.108.29
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’
2020-08-18 23:55:04 (15.9 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]
/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:254: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
(3000000, 300)
[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812 ...  0.08935547 -0.26757812  0.00836182 -0.21386719]
integer index of the word nice : 1
[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812 ...  0.08935547 -0.26757812  0.00836182 -0.21386719]
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
model = Sequential()
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_len, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)
Epoch 1/100 1/1 - 0s - loss: 0.7057 - acc: 0.5714
Epoch 2/100 1/1 - 0s - loss: 0.6868 - acc: 0.5714
...
Epoch 20/100 1/1 - 0s - loss: 0.4337 - acc: 1.0000
...
Epoch 100/100 1/1 - 0s - loss: 0.1130 - acc: 1.0000
<tensorflow.python.keras.callbacks.History at 0x7fad063b25c0>
from google.colab import drive
drive.mount('/content/drive')
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly
Enter your authorization code: ··········
Mounted at /content/drive
TensorFlow: an open-source machine learning library released by Google in 2015.
Keras: designed to make machine learning and deep learning intuitive and easy.
It provides a higher-level API that abstracts over the deep learning framework TensorFlow.
It allows simpler code than raw TensorFlow.
Gensim: an open-source library for topic modeling and natural language processing.
Supports Word2Vec and topic modeling.
Scikit-learn: provides a wide range of machine learning modules, such as Naive Bayes classification and support vector machines.
Ships with built-in datasets such as the iris data and the diabetes data; a short sketch of using them follows below.
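A minimal sketch, assuming scikit-learn is installed: loading the built-in iris dataset mentioned above and fitting one of the listed modules, a Naive Bayes classifier.
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
iris = load_iris()
X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target, random_state=0)
clf = GaussianNB().fit(X_tr, y_tr) # Gaussian Naive Bayes on the 4 iris features
print(clf.score(X_te, y_te))       # mean accuracy on the held-out split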
import nltk
nltk.download('treebank')
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
True
!pip install konlpy
Collecting konlpy
Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
|████████████████████████████████| 19.4MB 1.3MB/s
Requirement already satisfied: numpy>=1.6 in /usr/local/lib/python3.6/dist-packages (from konlpy) (1.18.5)
Collecting JPype1>=0.7.0
Downloading https://files.pythonhosted.org/packages/8b/f7/a368401e630f0e390dd0e62c39fb928e5b23741b53c2360ee7d376660927/JPype1-1.0.2-cp36-cp36m-manylinux2010_x86_64.whl (3.8MB)
|████████████████████████████████| 3.8MB 47.7MB/s
Collecting colorama
Downloading https://files.pythonhosted.org/packages/c9/dc/45cdef1b4d119eb96316b3117e6d5708a08029992b2fee2c143c7a0a5cc5/colorama-0.4.3-py2.py3-none-any.whl
Requirement already satisfied: lxml>=4.1.0 in /usr/local/lib/python3.6/dist-packages (from konlpy) (4.2.6)
Collecting tweepy>=3.7.0
Downloading https://files.pythonhosted.org/packages/bb/7c/99d51f80f3b77b107ebae2634108717362c059a41384a1810d13e2429a81/tweepy-3.9.0-py2.py3-none-any.whl
Collecting beautifulsoup4==4.6.0
Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
|████████████████████████████████| 92kB 8.4MB/s
Requirement already satisfied: typing-extensions; python_version < "3.8" in /usr/local/lib/python3.6/dist-packages (from JPype1>=0.7.0->konlpy) (3.7.4.2)
Requirement already satisfied: requests[socks]>=2.11.1 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (2.23.0)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (1.15.0)
Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tweepy>=3.7.0->konlpy) (1.3.0)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2.10)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (2020.6.20)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.24.3)
Requirement already satisfied: PySocks!=1.5.7,>=1.5.6; extra == "socks" in /usr/local/lib/python3.6/dist-packages (from requests[socks]>=2.11.1->tweepy>=3.7.0->konlpy) (1.7.1)
Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->tweepy>=3.7.0->konlpy) (3.1.0)
Installing collected packages: JPype1, colorama, tweepy, beautifulsoup4, konlpy
Found existing installation: tweepy 3.6.0
Uninstalling tweepy-3.6.0:
Successfully uninstalled tweepy-3.6.0
Found existing installation: beautifulsoup4 4.6.3
Uninstalling beautifulsoup4-4.6.3:
Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed JPype1-1.0.2 beautifulsoup4-4.6.0 colorama-0.4.3 konlpy-0.5.2 tweepy-3.9.0
import konlpy
konlpy.__version__
'0.5.2'
A DataFrame can be created by passing a two-dimensional list.
Being two-dimensional, it has a row index and column labels; that is, it is a data structure with both rows and columns.
While a Series consists of an index and values, a DataFrame adds columns, so it consists of columns, an index, and values.
df.head(n) - view only the first n rows
import pandas as pd
sr = pd.Series([17000, 18000, 1000, 5000],
index=["피자", "치킨", "콜라", "맥주"])
print(sr)
print(sr.values)
print(sr.index)
피자    17000
치킨    18000
콜라     1000
맥주     5000
dtype: int64
[17000 18000  1000  5000]
Index(['피자', '치킨', '콜라', '맥주'], dtype='object')
values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
index = ['one', 'two', 'three']
columns = ['A', 'B', 'C']
df = pd.DataFrame(values, index=index, columns=columns)
print(df)
       A  B  C
one    1  2  3
two    4  5  6
three  7  8  9
print(df.index)
print(df.columns)
print(df.values)
Index(['one', 'two', 'three'], dtype='object')
Index(['A', 'B', 'C'], dtype='object')
[[1 2 3]
 [4 5 6]
 [7 8 9]]
data = [
['1000', 'Steve', 90.72],
['1001', 'James', 78.09],
['1002', 'Doyeon', 98.43],
['1003', 'Jane', 64.19],
['1004', 'Pilwoong', 81.30],
['1005', 'Tony', 99.14],
]
df = pd.DataFrame(data)
print(df)
      0         1      2
0  1000     Steve  90.72
1  1001     James  78.09
2  1002    Doyeon  98.43
3  1003      Jane  64.19
4  1004  Pilwoong  81.30
5  1005      Tony  99.14
df = pd.DataFrame(data, columns=['학번', '이름', '점수'])
print(df)
     학번        이름     점수
0  1000     Steve  90.72
1  1001     James  78.09
2  1002    Doyeon  98.43
3  1003      Jane  64.19
4  1004  Pilwoong  81.30
5  1005      Tony  99.14
data = { '학번' : ['1000', '1001', '1002', '1003', '1004', '1005'],
'이름' : [ 'Steve', 'James', 'Doyeon', 'Jane', 'Pilwoong', 'Tony'],
'점수': [90.72, 78.09, 98.43, 64.19, 81.30, 99.14]}
df = pd.DataFrame(data)
print(df)
     학번        이름     점수
0  1000     Steve  90.72
1  1001     James  78.09
2  1002    Doyeon  98.43
3  1003      Jane  64.19
4  1004  Pilwoong  81.30
5  1005      Tony  99.14
print(df.head(3))
print(df.tail(3))
print(df['학번'])
학번 이름 점수
0 1000 Steve 90.72
1 1001 James 78.09
2 1002 Doyeon 98.43
학번 이름 점수
3 1003 Jane 64.19
4 1004 Pilwoong 81.30
5 1005 Tony 99.14
0 1000
1 1001
2 1002
3 1003
4 1004
5 1005
Name: 학번, dtype: object
print(df.index)
RangeIndex(start=0, stop=6, step=1)
import numpy as np
a = np.array([1, 2, 3, 4, 5]) # create a 1-D array from a list
print(type(a))
print(a)
<class 'numpy.ndarray'> [1 2 3 4 5]
b = np.array([[10, 20, 30], [ 60, 70, 80]])
print(b)
[[10 20 30] [60 70 80]]
print(a.ndim)  # number of dimensions
print(a.shape) # shape
print(b.ndim)  # number of dimensions
print(b.shape) # shape
1
(5,)
2
(2, 3)
a = np.zeros((2,3)) # create a 2x3 array of all zeros.
print(a)
[[0. 0. 0.] [0. 0. 0.]]
a = np.ones((2,3)) # create a 2x3 array of all ones.
print(a)
[[1. 1. 1.] [1. 1. 1.]]
a = np.full((2,2), 7) # create an array filled with a constant value, here 7.
print(a)
[[7 7] [7 7]]
a = np.eye(3) # create a 2-D array with ones on the diagonal and zeros elsewhere.
print(a)
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
a = np.random.random((2,2)) # create an array filled with random values
print(a)
[[0.2902947 0.2354292 ] [0.77328122 0.21608107]]
a = np.arange(10) # 0 through 9
print(a)
[0 1 2 3 4 5 6 7 8 9]
a = np.arange(1, 10, 2) # from 1 to 9 in steps of 2
print(a)
[1 3 5 7 9]
a = np.array(np.arange(30)).reshape((5,6))
print(a)
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]]
a = np.array([[1, 2, 3], [4, 5, 6]])
print(a)
[[1 2 3] [4 5 6]]
b=a[0:2, 0:2]
print(b)
[[1 2] [4 5]]
b = a[0, :] # the first row
print(b)
[1 2 3]
b = a[:, 1] # the second column
print(b)
[2 5]
a = np.array([[1,2], [4,5], [7,8]])
print(a)
b = a[[2, 1],[1, 0]] # fancy indexing: a[[row 2, row 1], [col 1, col 0]], i.e., picks a[2,1] and a[1,0]
print(b)
[[1 2]
 [4 5]
 [7 8]]
[8 4]
x = np.array([1,2,3])
y = np.array([4,5,6])
# element-wise addition; identical to b = x + y
b = np.add(x, y)
print(b)
[5 7 9]
# element-wise subtraction; identical to b = x - y
b = np.subtract(x, y)
print(b)
[-3 -3 -3]
# element-wise multiplication; identical to b = b * x
b = np.multiply(b, x)
print(b)
[-3 -6 -9]
# element-wise division; identical to b = b / x
b = np.divide(b, x)
print(b)
[-3. -3. -3.]
a = np.array([[1,2],[3,4]])
b = np.array([[5,6],[7,8]])
c = np.dot(a, b) # matrix multiplication
d = a * b # element-wise product
print(c)
print(d)
[[19 22]
 [43 50]]
[[ 5 12]
 [21 32]]
import matplotlib
# render plots inline in the Jupyter notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.title('test')
plt.plot([1,2,3,4],[2,4,8,6])
plt.show()
plt.title('test')
plt.plot([1,2,3,4],[2,4,8,6])
plt.xlabel('hours')
plt.ylabel('score')
plt.show()
plt.title('students')
plt.plot([1,2,3,4],[2,4,8,6]) # (1, 2), (2, 4), (3, 8), (4, 6)
plt.plot([1.5,2.5,3.5,4.5],[3,5,8,10]) # add a new line: (1.5, 3), (2.5, 5), (3.5, 8), (4.5, 10)
plt.xlabel('hours')
plt.ylabel('score')
plt.legend(['A student', 'B student']) # add a legend
plt.show()
import pandas as pd
import pandas_profiling
data = pd.read_csv('/spam.csv',encoding='latin1')
data[:5]
|   | v1 | v2 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 |
|---|----|----|------------|------------|------------|
| 0 | ham | Go until jurong point, crazy.. Available only ... | NaN | NaN | NaN |
| 1 | ham | Ok lar... Joking wif u oni... | NaN | NaN | NaN |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | NaN | NaN | NaN |
| 3 | ham | U dun say so early hor... U c already then say... | NaN | NaN | NaN |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... | NaN | NaN | NaN |
pr=data.profile_report()
pr
Types of preprocessing: tokenization, cleaning, and normalization.
What is tokenization? The task of splitting a given corpus into units called tokens. The unit varies by situation, but tokens are usually defined as meaningful units (in Korean, the smallest such unit is the morpheme).
Word tokenization: when the token unit is the word.
e.g., tokenizing on punctuation
Tokenization usually cannot be solved by a cleaning step that simply strips all punctuation and special characters: removing them entirely can make tokens lose their meaning. Moreover, unlike English, where splitting on whitespace mostly yields word tokens, Korean is hard to tokenize into words by whitespace alone.
!pip install nltk
Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (3.2.5)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk) (1.15.0)
nltk.download('punkt')
from nltk.tokenize import word_tokenize
print(word_tokenize("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."))
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']
from nltk.tokenize import WordPunctTokenizer
print(WordPunctTokenizer().tokenize("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."))
['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']
from tensorflow.keras.preprocessing.text import text_to_word_sequence
print(text_to_word_sequence("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."))
["don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'mr', "jone's", 'orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']
Tokenization cannot simply be treated as cutting the corpus on whitespace after removing punctuation.
A period (.) helps mark sentence boundaries, so it may be kept rather than stripped when extracting words.
Some words contain punctuation themselves: m.p.h, Ph.D, AT&T.
Take the dollar sign ($) or slash (/): $45.55 denotes a price, and 01/02/06 denotes a date. Such cases usually should not be split into separate tokens like 45 and 55.
Commas also appear inside numbers: quantities and prices are often written with commas every three digits, as in 123,456,789 or 100,000₩.
In English, the apostrophe (') often marks a contracted word that can be expanded: what're is short for what are, and we're for we are. Here re is called a clitic, the form a word takes when contracted; in I'm (short for I am), m is the clitic.
New York and rock 'n' roll are single words that contain internal spaces. Depending on the use case, such a multi-word expression may need to be kept as one token, so a tokenizer should be able to recognize it as one word (see the sketch below).
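A minimal sketch, assuming NLTK is installed: its MWETokenizer merges listed multi-word expressions back into single tokens after word tokenization. The expression list here is an illustrative assumption; in practice it would come from a curated lexicon.
import nltk
nltk.download('punkt')
from nltk.tokenize import MWETokenizer, word_tokenize
mwe_tokenizer = MWETokenizer([('New', 'York')], separator=' ')
tokens = word_tokenize("She moved to New York last year.")
print(mwe_tokenizer.tokenize(tokens))
# ['She', 'moved', 'to', 'New York', 'last', 'year', '.']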
from nltk.tokenize import TreebankWordTokenizer
tokenizer=TreebankWordTokenizer()
text="Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."
print(tokenizer.tokenize(text))
['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']
When the token unit is the sentence, the task is called sentence tokenization or sentence segmentation.
An uncleaned corpus is often not divided into sentences, so sentence tokenization may be needed to fit the corpus to its intended use.
! and ? are fairly reliable sentence boundary markers, but the period can appear where a sentence does not end, so the sentence-boundary criterion can always vary.
from nltk.tokenize import sent_tokenize
text="His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to mae sure no one was near."
print(sent_tokenize(text))
['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to make sure no one was near.']
from nltk.tokenize import sent_tokenize
text="I am actively looking for Ph.D. students. and you are a Ph.D student."
print(sent_tokenize(text))
['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']
!pip install kss # korean sentence splitter
Collecting kss
  Downloading https://files.pythonhosted.org/packages/fc/bb/4772901b3b934ac204f32a0bd6fc0567871d8378f9bbc7dd5fd5e16c6ee7/kss-1.3.1.tar.gz
Building wheels for collected packages: kss
  Building wheel for kss (setup.py) ... done
  Created wheel for kss: filename=kss-1.3.1-cp36-cp36m-linux_x86_64.whl size=251531 sha256=b4d4d83c5665e9e9aaa80b69dbc9e63b3ebc96b683e1fddb8d0993b6899e24a8
  Stored in directory: /root/.cache/pip/wheels/8b/98/d1/53f75f89925cd95779824778725ee3fa36e7aa55ed26ad54a8
Successfully built kss
Installing collected packages: kss
Successfully installed kss-1.3.1
import kss
text='딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어려워요. 농담아니에요. 이제 해보면 알걸요?'
print(kss.split_sentences(text))
['딥 러닝 자연어 처리가 재미있기는 합니다.', '그런데 문제는 영어보다 한국어로 할 때 너무 어려워요.', '농담아니에요.', '이제 해보면 알걸요?']
Sentence tokenization can be framed as binary classification: for each period, decide whether it marks an abbreviation or the end of a sentence. An abbreviation dictionary is useful here (a sketch follows below).
Open-source tools that perform sentence tokenization include NLTK, OpenNLP, Stanford CoreNLP, splitta, and LingPipe.
Reference links cover the many exceptions that must be handled when writing sentence tokenization rules.
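A minimal sketch of the abbreviation-dictionary idea above, not a production splitter: treat a period as a sentence boundary unless the token before it is a known abbreviation. The dictionary contents are illustrative assumptions.
ABBREVIATIONS = {'ph.d', 'mr', 'dr', 'e.g', 'i.e'}
def naive_sentence_split(text):
    sentences, current = [], []
    for token in text.split():
        current.append(token)
        if token.endswith('.'):
            # a period is a boundary only when the token is not a known abbreviation
            if token[:-1].lower() not in ABBREVIATIONS:
                sentences.append(' '.join(current))
                current = []
    if current:
        sentences.append(' '.join(current))
    return sentences
print(naive_sentence_split("I am actively looking for Ph.D. students. and you are a Ph.D student."))
# ['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']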
For English, whitespace tokenization yields good word tokens as long as exceptions such as the compound New York and contractions like he's are handled.
Korean, by contrast, is an agglutinative language, so tokenizing on whitespace-delimited units ('eojeol') is discouraged in Korean NLP.
Korean has postpositional particles ('josa').
Particles attach directly to a word with no space, as in '그가', '그에게', '그를', '그와', '그는', so during cleaning the same word ends up treated as different words. The particles therefore need to be split off.
Korean tokenization requires understanding the concept of the morpheme: the smallest unit of language that carries meaning. Morphemes divide into free morphemes and bound morphemes.
Free morpheme: a morpheme that can stand alone, independent of affixes, endings, or particles; it is a word by itself. Examples: substantives (nouns, pronouns, numerals), modifiers (determiners, adverbs), and interjections.
Bound morpheme: a morpheme used only in combination with other morphemes: affixes, endings, particles, and stems.
ex) sentence: 에디가 딥러닝책을 읽었다 ("Eddy read the deep learning book")
Decomposed into morphemes:
free morphemes: 에디, 딥러닝책
bound morphemes: -가, -을, 읽-, -었, -다
From this we can infer that to get something analogous to English word tokenization in Korean, we must perform morpheme tokenization rather than eojeol tokenization.
Korean spacing is followed less consistently than English spacing.
Compared with English, Korean spacing is both harder and less consistently observed, because Korean text remains easy to understand even without spaces.
ex1) 제가이렇게띄어쓰기를전혀하지않고글을썼다고하더라도글을이해할수있습니다.
ex2) Tobeornottobethatisthequestion
In English, removing spaces quickly produces text that is hard to read. This stems from a linguistic difference between Korean, which composes syllable blocks (moa-sseugi), and English, which writes letters linearly (pureo-sseugi).
A word's meaning can change with its part of speech even when its spelling is identical. In English, 'fly' means "to fly" as a verb but "a fly (insect)" as a noun; in Korean, the noun '못' means a nail ("an object used with a hammer to fasten wood"), while the adverb '못' negates action verbs like '먹는다' or '달린다'.
To grasp a word's meaning properly, then, we need to identify its part of speech. The task of labeling each word with its part of speech is called part-of-speech (POS) tagging.
The morphological analyzers available in KoNLPy
from nltk.tokenize import word_tokenize
text="I am actively looking for Ph.D. students. and you are a Ph.D. student."
print(word_tokenize(text))
['I', 'am', 'actively', 'looking', 'for', 'Ph.D.', 'students', '.', 'and', 'you', 'are', 'a', 'Ph.D.', 'student', '.']
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
x=word_tokenize(text)
pos_tag(x)
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]   /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[('I', 'PRP'),
('am', 'VBP'),
('actively', 'RB'),
('looking', 'VBG'),
('for', 'IN'),
('Ph.D.', 'NNP'),
('students', 'NNS'),
('.', '.'),
('and', 'CC'),
('you', 'PRP'),
('are', 'VBP'),
('a', 'DT'),
('Ph.D.', 'NNP'),
('student', 'NN'),
('.', '.')]
from konlpy.tag import Okt
okt=Okt()
print(okt.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print(okt.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print(okt.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']
[('열심히', 'Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가봐요', 'Verb')]
['코딩', '당신', '연휴', '여행']
from konlpy.tag import Kkma
kkma=Kkma()
print(kkma.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print(kkma.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print(kkma.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요']
[('열심히', 'MAG'), ('코딩', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('당신', 'NP'), (',', 'SP'), ('연휴', 'NNG'), ('에', 'JKM'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가보', 'VV'), ('아요', 'EFN')]
['코딩', '당신', '연휴', '여행']
Like tokenization, this is a type of text preprocessing.
Words with the same meaning but different surface forms, such as USA vs. US or uh-huh vs. uhhuh, should be normalized into one form. Methods include stemming and lemmatization.
Case folding: in English, words that differ only in case, like bus and Bus, need a normalization step to be recognized as the same word (see the sketch below).
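A minimal sketch of surface-form normalization under the assumptions above: case folding plus a small hand-built mapping of spelling variants. The mapping entries are illustrative, not a standard resource.
VARIANTS = {'usa': 'us', 'uhhuh': 'uh-huh'}
def normalize(tokens):
    normalized = []
    for tok in tokens:
        tok = tok.lower()            # case folding: Bus -> bus
        tok = VARIANTS.get(tok, tok) # map known variants onto one canonical form
        normalized.append(tok)
    return normalized
print(normalize(['USA', 'Bus', 'bus', 'uhhuh'])) # ['us', 'bus', 'bus', 'uh-huh']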
Short words
In English, simply deleting short words removes a good share of the words that carry little meaning for NLP, because most short English words are stopwords. Doing so also sweeps away punctuation at the same time.
The average English word is about 6-7 characters long, while the average Korean word is estimated at about 2-3; English words are, on average, longer than Korean ones.
This is because Korean has many Sino-Korean words in which a single character already carries meaning. For example, in '학교' (school), the characters 學 (learn) and 校 (school) each carry condensed meaning, whereas English needs the six letters s, c, h, o, o, l to express the same word.
Because of this, in English, simply removing words of length 2-3 or less eliminates many low-content words. Removing length-1 words mostly drops the article 'a' and the pronoun 'I'; removing length-2 words mostly drops stopwords such as it, at, to, on, in, and by.
import re
text = "I was wondering if anyone out there could enlighten me on this car."
# \b marks a word boundary, \W* eats any preceding non-word characters, and \w{1,2} matches a word of length 1-2
shortword = re.compile(r'\W*\b\w{1,2}\b')
print(shortword.sub('', text))
was wondering anyone out there could enlighten this car.
Stemming and Lemmatization
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
n=WordNetLemmatizer()
words=['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
print([n.lemmatize(w) for w in words])
## Output:
#### ['policy', 'doing', 'organization', 'have', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']
## Results like dy and ha are inaccurate because the lemmatizer cannot know each word's original part of speech.
## Passing the part of speech to lemmatize therefore yields the correct lemma.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
['policy', 'doing', 'organization', 'have', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']
n.lemmatize('dies', 'v')
'die'
n.lemmatize('watched', 'v')
'watch'
n.lemmatize('has', 'v')
'have'
- Stemming results (a runnable comparison follows after this list)
- am → am
- the going → the go
- having → hav
- Lemmatization results
- am → be
- the going → the going
- having → have
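A minimal side-by-side sketch of the contrast above, assuming the NLTK resources downloaded earlier in this document:
from nltk.stem import PorterStemmer, WordNetLemmatizer
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()
for w in ['am', 'having']:
    print(w, stemmer.stem(w), lemmatizer.lemmatize(w, 'v'))
# am -> am / be, having -> hav / have: stemming chops suffixes, lemmatization maps to dictionary forms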
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
s = PorterStemmer()
text="This was not the map we found in Billy Bones's chest, but an accurate copy, complete in all things--names and heights and soundings--with the single exception of the red crosses and the written notes."
words=word_tokenize(text)
print(words)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['This', 'was', 'not', 'the', 'map', 'we', 'found', 'in', 'Billy', 'Bones', "'s", 'chest', ',', 'but', 'an', 'accurate', 'copy', ',', 'complete', 'in', 'all', 'things', '--', 'names', 'and', 'heights', 'and', 'soundings', '--', 'with', 'the', 'single', 'exception', 'of', 'the', 'red', 'crosses', 'and', 'the', 'written', 'notes', '.']
print([s.stem(w) for w in words])
['thi', 'wa', 'not', 'the', 'map', 'we', 'found', 'in', 'billi', 'bone', "'s", 'chest', ',', 'but', 'an', 'accur', 'copi', ',', 'complet', 'in', 'all', 'thing', '--', 'name', 'and', 'height', 'and', 'sound', '--', 'with', 'the', 'singl', 'except', 'of', 'the', 'red', 'cross', 'and', 'the', 'written', 'note', '.']
words=['formalize', 'allowance', 'electricical']
print([s.stem(w) for w in words])
['formal', 'allow', 'electric']
from nltk.stem import PorterStemmer
s=PorterStemmer()
words=['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
print([s.stem(w) for w in words])
from nltk.stem import LancasterStemmer
l=LancasterStemmer()
words=['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
print([l.stem(w) for w in words])
['polici', 'do', 'organ', 'have', 'go', 'love', 'live', 'fli', 'die', 'watch', 'ha', 'start']
['policy', 'doing', 'org', 'hav', 'going', 'lov', 'liv', 'fly', 'die', 'watch', 'has', 'start']
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords.words('english')[:10]
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
example = "Family is not an important thing. It's everything."
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example)
result = []
for w in word_tokens:
    if w not in stop_words:
        result.append(w)
print(word_tokens)
print(result)
['Family', 'is', 'not', 'an', 'important', 'thing', '.', 'It', "'s", 'everything', '.']
['Family', 'important', 'thing', '.', 'It', "'s", 'everything', '.']
After memorizing the regular expression syntax, it helps to build intuition by solving the puzzles at https://regexcrossword.com/challenges/tutorial/puzzles/1.
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer=RegexpTokenizer("[\w]+") # 문자+숫자가 1개 이상인 경우, 즉 워드토크나이징 명령.
print(tokenizer.tokenize("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop"))
['Don', 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'Mr', 'Jone', 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer=RegexpTokenizer("[\s]+", gaps=True) # 스페이스 1개 이상인 경우, 즉 공백 토크나이징
print(tokenizer.tokenize("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop"))
["Don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name,', 'Mr.', "Jone's", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."
# Sentence tokenization
text = sent_tokenize(text)
print(text)
['A barber is a person.', 'a barber is good person.', 'a barber is huge person.', 'he Knew A Secret!', 'The Secret He Kept is huge secret.', 'Huge secret.', 'His barber kept his word.', 'a barber kept his word.', 'His barber kept his secret.', 'But keeping and keeping such a huge secret to himself was driving the barber crazy.', 'the barber went up a huge mountain.']
# Cleaning and word tokenization
vocab = {} # a Python dictionary
sentences = []
stop_words = set(stopwords.words('english'))
for i in text:
    sentence = word_tokenize(i) # word-tokenize each sentence
    result = []
    for word in sentence:
        word = word.lower() # lowercase every word to shrink the vocabulary
        if word not in stop_words: # remove stopwords from the tokenized result
            if len(word) > 2: # additionally drop words of length 2 or less
                result.append(word)
                if word not in vocab:
                    vocab[word] = 0
                vocab[word] += 1
    sentences.append(result)
print(sentences)
print(vocab)
print(vocab["barber"]) # 'barber'라는 단어의 빈도수 출력
[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]
{'barber': 8, 'person': 3, 'good': 1, 'huge': 5, 'knew': 1, 'secret': 6, 'kept': 4, 'word': 2, 'keeping': 2, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1}
8
# sort by frequency
vocab_sorted = sorted(vocab.items(), key = lambda x:x[1], reverse = True)
print(vocab_sorted)
[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2), ('good', 1), ('knew', 1), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)]
# assign indices in order of frequency
word_to_index = {}
i = 0
for (word, frequency) in vocab_sorted:
    if frequency > 1:  # as mentioned in the Cleaning chapter, drop low-frequency words
        i = i + 1
        word_to_index[word] = i
print(word_to_index)
{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7}
vocab_size = 5
words_frequency = [w for w, c in word_to_index.items() if c >= vocab_size + 1]  # words whose index is greater than 5
for w in words_frequency:
    del word_to_index[w]  # delete the index entry for that word
print(word_to_index)
{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}
# words not among the top 5 get the index named 'OOV', i.e. OOV = 6
word_to_index['OOV'] = len(word_to_index) + 1  # OOV: Out-Of-Vocabulary
# integer encoding
# e.g. sentences[0] = ['barber', 'person'] -> [1, 5]
encoded = []
for s in sentences:
    temp = []
    for w in s:
        try:
            temp.append(word_to_index[w])
        except KeyError:
            temp.append(word_to_index['OOV'])
    encoded.append(temp)
print(encoded)
[[1, 5], [1, 6, 5], [1, 3, 5], [6, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [6, 6, 3, 2, 6, 1, 6], [1, 6, 3, 6]]
from collections import Counter
print(sentences)
words = sum(sentences, [])  # flatten the 2-D list into a 1-D list
# the same thing can also be done with words = np.hstack(sentences)
print(words)
vocab = Counter(words)  # Python's Counter makes it easy to count every word's frequency
print(vocab)
print(vocab["barber"])  # print the frequency of the word 'barber'
[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]
['barber', 'person', 'barber', 'good', 'person', 'barber', 'huge', 'person', 'knew', 'secret', 'secret', 'kept', 'huge', 'secret', 'huge', 'secret', 'barber', 'kept', 'word', 'barber', 'kept', 'word', 'barber', 'kept', 'secret', 'keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy', 'barber', 'went', 'huge', 'mountain']
Counter({'barber': 8, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1})
8
vocab_size = 5
vocab = vocab.most_common(vocab_size)  # keep only the 5 most frequent words
vocab
[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]
# assign indices in order of frequency
word_to_index = {}
i = 0
for (word, frequency) in vocab:
    i = i + 1
    word_to_index[word] = i
print(word_to_index)
{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}
from nltk import FreqDist
import numpy as np
# use np.hstack to remove the sentence boundaries, e.g. ['barber', 'person', 'barber', 'good', ...]
vocab = FreqDist(np.hstack(sentences))
print(vocab["barber"])  # print the frequency of the word 'barber'
vocab_size = 5
vocab = vocab.most_common(vocab_size)  # keep only the 5 most frequent words
word_to_index = {word[0]: index + 1 for index, word in enumerate(vocab)}  # assign indices in order of frequency
print(word_to_index)
8
{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}
from tensorflow.keras.preprocessing.text import Tokenizer
print(sentences)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)  # given a corpus, fit_on_texts() builds the vocabulary based on word frequency
print(tokenizer.word_index)  # indices assigned in order of frequency
print(tokenizer.word_counts)  # word counts
print(tokenizer.texts_to_sequences(sentences))  # convert to the assigned indices
[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]
{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7, 'good': 8, 'knew': 9, 'driving': 10, 'crazy': 11, 'went': 12, 'mountain': 13}
OrderedDict([('barber', 8), ('person', 3), ('good', 1), ('huge', 5), ('knew', 1), ('secret', 6), ('kept', 4), ('word', 2), ('keeping', 2), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)])
[[1, 5], [1, 8, 5], [1, 3, 5], [9, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 10, 1, 11], [1, 12, 3, 13]]
vocab_size = 5
tokenizer = Tokenizer(num_words = vocab_size + 1)  # use only the top 5 words; the limit is applied when texts_to_sequences is called
tokenizer.fit_on_texts(sentences)
print(tokenizer.word_index)
print(tokenizer.word_counts)
print(tokenizer.texts_to_sequences(sentences))
{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7, 'good': 8, 'knew': 9, 'driving': 10, 'crazy': 11, 'went': 12, 'mountain': 13}
OrderedDict([('barber', 8), ('person', 3), ('good', 1), ('huge', 5), ('knew', 1), ('secret', 6), ('kept', 4), ('word', 2), ('keeping', 2), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)])
[[1, 5], [1, 5], [1, 3, 5], [2], [2, 4, 3, 2], [3, 2], [1, 4], [1, 4], [1, 4, 2], [3, 2, 1], [1, 3]]
tokenizer = Tokenizer()  # num_words is not specified here
tokenizer.fit_on_texts(sentences)
vocab_size = 5
words_frequency = [w for w, c in tokenizer.word_index.items() if c >= vocab_size + 1]  # words whose index is greater than 5
for w in words_frequency:
    del tokenizer.word_index[w]   # delete the index entry for that word
    del tokenizer.word_counts[w]  # delete the count entry for that word
print(tokenizer.word_index)
print(tokenizer.word_counts)
print(tokenizer.texts_to_sequences(sentences))
{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}
OrderedDict([('barber', 8), ('person', 3), ('huge', 5), ('secret', 6), ('kept', 4)])
[[1, 5], [1, 5], [1, 3, 5], [2], [2, 4, 3, 2], [3, 2], [1, 4], [1, 4], [1, 4, 2], [3, 2, 1], [1, 3]]
vocab_size = 5
tokenizer = Tokenizer(num_words = vocab_size + 2, oov_token = 'OOV')
# use only the 5 most frequent words; the vocabulary size is +2 to account for index 0 and OOV
tokenizer.fit_on_texts(sentences)
print('Index of the word OOV : {}'.format(tokenizer.word_index['OOV']))
print(tokenizer.texts_to_sequences(sentences))
Index of the word OOV : 1
[[2, 6], [2, 1, 6], [2, 4, 6], [1, 3], [3, 5, 4, 3], [4, 3], [2, 5, 1], [2, 5, 1], [2, 5, 3], [1, 1, 4, 3, 1, 2, 1], [2, 1, 4, 1]]
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
sentences = [['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)  # given a corpus, fit_on_texts() builds the vocabulary based on word frequency
encoded = tokenizer.texts_to_sequences(sentences)
print(encoded)  # integer encoding with Keras
[[1, 5], [1, 8, 5], [1, 3, 5], [9, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 10, 1, 11], [1, 12, 3, 13]]
max_len = max(len(item) for item in encoded)
print(max_len)
7
for item in encoded:  # for each sentence
    while len(item) < max_len:  # while it is shorter than max_len
        item.append(0)
padded_np = np.array(encoded)
padded_np
array([[ 1, 5, 0, 0, 0, 0, 0],
[ 1, 8, 5, 0, 0, 0, 0],
[ 1, 3, 5, 0, 0, 0, 0],
[ 9, 2, 0, 0, 0, 0, 0],
[ 2, 4, 3, 2, 0, 0, 0],
[ 3, 2, 0, 0, 0, 0, 0],
[ 1, 4, 6, 0, 0, 0, 0],
[ 1, 4, 6, 0, 0, 0, 0],
[ 1, 4, 2, 0, 0, 0, 0],
[ 7, 7, 3, 2, 10, 1, 11],
[ 1, 12, 3, 13, 0, 0, 0]])
from tensorflow.keras.preprocessing.sequence import pad_sequences
encoded = tokenizer.texts_to_sequences(sentences)
print(encoded)
padded = pad_sequences(encoded)
padded
[[1, 5], [1, 8, 5], [1, 3, 5], [9, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 10, 1, 11], [1, 12, 3, 13]]
array([[ 0, 0, 0, 0, 0, 1, 5],
[ 0, 0, 0, 0, 1, 8, 5],
[ 0, 0, 0, 0, 1, 3, 5],
[ 0, 0, 0, 0, 0, 9, 2],
[ 0, 0, 0, 2, 4, 3, 2],
[ 0, 0, 0, 0, 0, 3, 2],
[ 0, 0, 0, 0, 1, 4, 6],
[ 0, 0, 0, 0, 1, 4, 6],
[ 0, 0, 0, 0, 1, 4, 2],
[ 7, 7, 3, 2, 10, 1, 11],
[ 0, 0, 0, 1, 12, 3, 13]], dtype=int32)
padded = pad_sequences(encoded, padding = 'post')
padded
array([[ 1, 5, 0, 0, 0, 0, 0],
[ 1, 8, 5, 0, 0, 0, 0],
[ 1, 3, 5, 0, 0, 0, 0],
[ 9, 2, 0, 0, 0, 0, 0],
[ 2, 4, 3, 2, 0, 0, 0],
[ 3, 2, 0, 0, 0, 0, 0],
[ 1, 4, 6, 0, 0, 0, 0],
[ 1, 4, 6, 0, 0, 0, 0],
[ 1, 4, 2, 0, 0, 0, 0],
[ 7, 7, 3, 2, 10, 1, 11],
[ 1, 12, 3, 13, 0, 0, 0]], dtype=int32)
(padded == padded_np).all()
True
# use an index other than zero for padding
last_value = len(tokenizer.word_index) + 1  # use a number one larger than the vocabulary size
print(last_value)
14
padded = pad_sequences(encoded, padding = 'post', value = last_value)
padded
array([[ 1, 5, 14, 14, 14, 14, 14],
[ 1, 8, 5, 14, 14, 14, 14],
[ 1, 3, 5, 14, 14, 14, 14],
[ 9, 2, 14, 14, 14, 14, 14],
[ 2, 4, 3, 2, 14, 14, 14],
[ 3, 2, 14, 14, 14, 14, 14],
[ 1, 4, 6, 14, 14, 14, 14],
[ 1, 4, 6, 14, 14, 14, 14],
[ 1, 4, 2, 14, 14, 14, 14],
[ 7, 7, 3, 2, 10, 1, 11],
[ 1, 12, 3, 13, 14, 14, 14]], dtype=int32)
One-hot encoding consists of two steps: assign an integer index to each word, then turn that integer into a vector that is 1 at its index and 0 everywhere else.
#!pip install konlpy
from konlpy.tag import Okt
okt = Okt()  # formerly the Twitter tokenizer
token = okt.morphs("나는 자연어 처리를 배운다")  # morpheme-level tokenization of the sentence
print(token)
['나', '는', '자연어', '처리', '를', '배운다']
# when the text is long, encoding should be done in order of word frequency; that step is skipped here
word2index={}
for voca in token:
    if voca not in word2index.keys():
        word2index[voca] = len(word2index)
print(word2index)
{'나': 0, '는': 1, '자연어': 2, '처리': 3, '를': 4, '배운다': 5}
# a function that takes a token and produces the one-hot vector for that token
def one_hot_encoding(word, word2index):
    one_hot_vector = [0] * len(word2index)
    index = word2index[word]
    one_hot_vector[index] = 1
    return one_hot_vector
one_hot_encoding("자연어",word2index)
[0, 0, 1, 0, 0, 0]
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
text="나랑 점심 먹으러 갈래 점심 메뉴는 햄버거 갈래 갈래 햄버거 최고야"
t = Tokenizer()
t.fit_on_texts([text])
print(t.word_index)  # print the integer index assigned to each word
sub_text = "점심 먹으러 갈래 메뉴는 햄버거 최고야"  # a sub-text made up only of words in the vocabulary (word_index)
encoded = t.texts_to_sequences([sub_text])[0]  # integer encoding via texts_to_sequences
print(encoded)
{'갈래': 1, '점심': 2, '햄버거': 3, '나랑': 4, '먹으러': 5, '메뉴는': 6, '최고야': 7}
[2, 5, 1, 6, 3, 7]
one_hot = to_categorical(encoded)  # one-hot encoding
print(one_hot)
[[0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]
To make up for the drawbacks of the two representations above, there are techniques that vectorize words in a multi-dimensional space while reflecting their latent meaning (word embeddings).
See the team drive materials.
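As a minimal sketch of such an embedding technique (not part of the original notes), the snippet below trains a small Word2Vec model with gensim (assuming gensim 4.x is available) on the tokenized barber sentences from earlier; vector_size, window, and min_count are illustrative values, not tuned settings.
from gensim.models import Word2Vec
# the tokenized barber sentences produced in the integer-encoding section above
sentences = [['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]
model = Word2Vec(sentences=sentences, vector_size=8, window=2, min_count=1, sg=0)  # CBOW (sg=0)
print(model.wv['barber'])               # dense 8-dimensional vector for 'barber'
print(model.wv.most_similar('barber'))  # nearest words in the learned vector space
Each word is now a dense real-valued vector instead of a sparse one-hot vector, so words used in similar contexts can end up close to each other in the embedding space.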
Probability of the next word
The symbol | denotes conditional probability.
The probability of an entire word sequence W is only known after every word has been predicted, so the probability of a word sequence is P(W) = P(w1, w2, ..., wn).
For two probabilities P(A) and P(B), the following holds: P(A, B) = P(A) P(B|A).
When four probabilities are related by conditional probability: P(A, B, C, D) = P(A) P(B|A) P(C|A, B) P(D|A, B, C).
This generalizes to the chain rule of conditional probability: P(x1, x2, ..., xn) = P(x1) P(x2|x1) P(x3|x1, x2) ... P(xn|x1, ..., xn-1).
Applying the chain rule, the probability of the sentence 'An adorable little boy is spreading smiles' is
P(An adorable little boy is spreading smiles) = P(An)×P(adorable|An)×P(little|An adorable)×P(boy|An adorable little)×P(is|An adorable little boy)×P(spreading|An adorable little boy is)×P(smiles|An adorable little boy is spreading)
A statistical language model (SLM) computes the probability of the next word from counts of the preceding words.
The probability that 'is' appears after 'An adorable little boy' is P(is|An adorable little boy) = Count(An adorable little boy is) / Count(An adorable little boy), estimated from corpus counts; see the sketch below.
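A toy sketch of this count-based estimate (the two-sentence corpus here is invented purely for illustration):
from collections import Counter
corpus = ["an adorable little boy is spreading smiles", "an adorable little boy is playing outside"]
tokens = [sentence.split() for sentence in corpus]
context = ("an", "adorable", "little", "boy")
context_count = 0             # Count(an adorable little boy)
next_word_counts = Counter()  # Count(an adorable little boy w) for each following word w
for sent in tokens:
    for i in range(len(sent) - len(context)):
        if tuple(sent[i:i + len(context)]) == context:
            context_count += 1
            next_word_counts[sent[i + len(context)]] += 1
# P(is | an adorable little boy) = Count(an adorable little boy is) / Count(an adorable little boy)
print(next_word_counts["is"] / context_count)  # 1.0 for this toy corpus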
Generalization techniques such as n-grams and backoff do not fundamentally solve the sparsity problem; this is the reason for moving on to neural-network language models (Chapter 8).
An n-gram model splits the corpus into chunks of n consecutive words and treats each chunk as a single token.
unigrams : an, adorable, little, boy, is, spreading, smiles
bigrams : an adorable, adorable little, little boy, boy is, is spreading, spreading smiles
trigrams : an adorable little, adorable little boy, little boy is, boy is spreading, is spreading smiles
4-grams : an adorable little boy, adorable little boy is, little boy is spreading, boy is spreading smiles
When a 4-gram language model predicts the word that follows 'An adorable little boy is spreading', it only considers the preceding n-1 = 3 words, i.e. 'boy is spreading' (see the n-gram extraction sketch below).
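A minimal sketch (not from the original notes) of extracting the n-grams listed above with NLTK's nltk.util.ngrams helper:
from nltk.util import ngrams
tokens = "an adorable little boy is spreading smiles".split()
print(list(ngrams(tokens, 2)))  # bigrams:  ('an', 'adorable'), ('adorable', 'little'), ...
print(list(ngrams(tokens, 4)))  # 4-grams: ('an', 'adorable', 'little', 'boy'), ...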
from konlpy.tag import Okt
import re
okt=Okt()
token=re.sub("(\.)","","정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다.")
# 정규 표현식을 통해 온점을 제거하는 정제 작업입니다.
token=okt.morphs(token)
# OKT 형태소 분석기를 통해 토큰화 작업을 수행한 뒤에, token에다가 넣습니다.
word2index={}
bow=[]
for voca in token:
if voca not in word2index.keys():
word2index[voca]=len(word2index)
# token을 읽으면서, word2index에 없는 (not in) 단어는 새로 추가하고, 이미 있는 단어는 넘깁니다.
bow.insert(len(word2index)-1,1)
# BoW 전체에 전부 기본값 1을 넣어줍니다. 단어의 개수는 최소 1개 이상이기 때문입니다.
else:
index=word2index.get(voca)
# 재등장하는 단어의 인덱스를 받아옵니다.
bow[index]=bow[index]+1
# 재등장한 단어는 해당하는 인덱스의 위치에 1을 더해줍니다. (단어의 개수를 세는 것입니다.)
print(word2index)
bow
{'정부': 0, '가': 1, '발표': 2, '하는': 3, '물가상승률': 4, '과': 5, '소비자': 6, '느끼는': 7, '은': 8, '다르다': 9}
[1, 2, 1, 1, 2, 1, 1, 1, 1, 1]
from konlpy.tag import Okt
import re
okt=Okt()
token=re.sub("(\.)","","소비자는 주로 소비하는 상품을 기준으로 물가상승률을 느낀다.")
# 정규 표현식을 통해 온점을 제거하는 정제 작업입니다.
token=okt.morphs(token)
# OKT 형태소 분석기를 통해 토큰화 작업을 수행한 뒤에, token에다가 넣습니다.
word2index={}
bow=[]
for voca in token:
if voca not in word2index.keys():
word2index[voca]=len(word2index)
# token을 읽으면서, word2index에 없는 (not in) 단어는 새로 추가하고, 이미 있는 단어는 넘깁니다.
bow.insert(len(word2index)-1,1)
# BoW 전체에 전부 기본값 1을 넣어줍니다. 단어의 개수는 최소 1개 이상이기 때문입니다.
else:
index=word2index.get(voca)
# 재등장하는 단어의 인덱스를 받아옵니다.
bow[index]=bow[index]+1
# 재등장한 단어는 해당하는 인덱스의 위치에 1을 더해줍니다. (단어의 개수를 세는 것입니다.)
print(word2index)
bow
{'소비자': 0, '는': 1, '주로': 2, '소비': 3, '하는': 4, '상품': 5, '을': 6, '기준': 7, '으로': 8, '물가상승률': 9, '느낀다': 10}
[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]
from konlpy.tag import Okt
import re
okt=Okt()
token=re.sub("(\.)","","정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다. 소비자는 주로 소비하는 상품을 기준으로 물가상승률을 느낀다.")
# 정규 표현식을 통해 온점을 제거하는 정제 작업입니다.
token=okt.morphs(token)
# OKT 형태소 분석기를 통해 토큰화 작업을 수행한 뒤에, token에다가 넣습니다.
word2index={}
bow=[]
for voca in token:
if voca not in word2index.keys():
word2index[voca]=len(word2index)
# token을 읽으면서, word2index에 없는 (not in) 단어는 새로 추가하고, 이미 있는 단어는 넘깁니다.
bow.insert(len(word2index)-1,1)
# BoW 전체에 전부 기본값 1을 넣어줍니다. 단어의 개수는 최소 1개 이상이기 때문입니다.
else:
index=word2index.get(voca)
# 재등장하는 단어의 인덱스를 받아옵니다.
bow[index]=bow[index]+1
# 재등장한 단어는 해당하는 인덱스의 위치에 1을 더해줍니다. (단어의 개수를 세는 것입니다.)
print(word2index)
bow
{'정부': 0, '가': 1, '발표': 2, '하는': 3, '물가상승률': 4, '과': 5, '소비자': 6, '느끼는': 7, '은': 8, '다르다': 9, '는': 10, '주로': 11, '소비': 12, '상품': 13, '을': 14, '기준': 15, '으로': 16, '느낀다': 17}
[1, 2, 1, 2, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['you know I want your love. because I love you.']
vector = CountVectorizer()
print(vector.fit_transform(corpus).toarray())  # record each word's frequency from the corpus
print(vector.vocabulary_)  # show which index was assigned to each word
[[1 1 2 1 2 1]]
{'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}
from sklearn.feature_extraction.text import CountVectorizer
text=["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words=["the", "a", "an", "is", "not"])
print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)
[[1 1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}
from sklearn.feature_extraction.text import CountVectorizer
text=["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words="english")
print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)
[[1 1 1]]
{'family': 0, 'important': 1, 'thing': 2}
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
text=["Family is not an important thing. It's everything."]
sw = stopwords.words("english")
vect = CountVectorizer(stop_words =sw)
print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
[[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}
import pandas as pd  # for DataFrame output
from math import log  # for computing IDF
docs = [
'먹고 싶은 사과',
'먹고 싶은 바나나',
'길고 노란 바나나 바나나',
'저는 과일이 좋아요'
]
vocab = list(set(w for doc in docs for w in doc.split()))
vocab.sort()
N = len(docs)  # total number of documents
def tf(t, d):
    return d.count(t)
def idf(t):
    df = 0
    for doc in docs:
        df += t in doc
    return log(N / (df + 1))
def tfidf(t, d):
    return tf(t, d) * idf(t)
result = []
for i in range(N):  # for each document
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        result[-1].append(tf(t, d))
tf_ = pd.DataFrame(result, columns = vocab)
tf_
| | 과일이 | 길고 | 노란 | 먹고 | 바나나 | 사과 | 싶은 | 저는 | 좋아요 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 |
| 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| 2 | 0 | 1 | 1 | 0 | 2 | 0 | 0 | 0 | 0 |
| 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
result = []
for j in range(len(vocab)):
    t = vocab[j]
    result.append(idf(t))
idf_ = pd.DataFrame(result, index = vocab, columns = ["IDF"])
idf_
| | IDF |
|---|---|
| 과일이 | 0.693147 |
| 길고 | 0.693147 |
| 노란 | 0.693147 |
| 먹고 | 0.287682 |
| 바나나 | 0.287682 |
| 사과 | 0.693147 |
| 싶은 | 0.287682 |
| 저는 | 0.693147 |
| 좋아요 | 0.693147 |
result = []
for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        result[-1].append(tfidf(t, d))
tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_
| | 과일이 | 길고 | 노란 | 먹고 | 바나나 | 사과 | 싶은 | 저는 | 좋아요 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.000000 | 0.000000 | 0.000000 | 0.287682 | 0.000000 | 0.693147 | 0.287682 | 0.000000 | 0.000000 |
| 1 | 0.000000 | 0.000000 | 0.000000 | 0.287682 | 0.287682 | 0.000000 | 0.287682 | 0.000000 | 0.000000 |
| 2 | 0.000000 | 0.693147 | 0.693147 | 0.000000 | 0.575364 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3 | 0.693147 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.693147 | 0.693147 |
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
'you know I want your love',
'I like you',
'what should I do ',
]
vector = CountVectorizer()
print(vector.fit_transform(corpus).toarray())  # record each word's frequency from the corpus
print(vector.vocabulary_)  # show which index was assigned to each word
[[0 1 0 1 0 1 0 1 1]
[0 0 1 0 0 0 0 1 0]
[1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
'you know I want your love',
'I like you',
'what should I do ',
]
tfidfv = TfidfVectorizer().fit(corpus)
print(tfidfv.transform(corpus).toarray())
print(tfidfv.vocabulary_)
[[0. 0.46735098 0. 0.46735098 0. 0.46735098
0. 0.35543247 0.46735098]
[0. 0. 0.79596054 0. 0. 0.
0. 0.60534851 0. ]
[0.57735027 0. 0. 0. 0.57735027 0.
0.57735027 0. 0. ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}
from numpy import dot
from numpy.linalg import norm
import numpy as np
def cos_sim(A, B):
    return dot(A, B) / (norm(A) * norm(B))
doc1=np.array([0,1,1,1])
doc2=np.array([1,0,1,1])
doc3=np.array([2,0,2,2])
print(cos_sim(doc1, doc2))  # cosine similarity between document 1 and document 2
print(cos_sim(doc1, doc3))  # cosine similarity between document 1 and document 3
print(cos_sim(doc2, doc3))  # cosine similarity between document 2 and document 3
0.6666666666666667
0.6666666666666667
1.0000000000000002
import pandas as pd
data = pd.read_csv('/content/movies_metadata.csv', low_memory=False)
# e.g. for the author, who placed the file on the Windows desktop:
# pd.read_csv(r'C:\Users\USER\Desktop\movies_metadata.csv', low_memory=False)
data.head(2)
data=data.head(20000)
print(data['overview'].isnull().sum())
data['overview'] = data['overview'].fillna('')  # replace NULL values with the empty string ''
135
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['overview'])
# run TF-IDF on the overview column
print(tfidf_matrix.shape)  # 47,487 words are used across the 20,000 movies
(20000, 47487)
# compute the cosine similarity matrix
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim
array([[1. , 0.01575748, 0. , ..., 0. , 0. ,
0. ],
[0.01575748, 1. , 0.04907345, ..., 0. , 0. ,
0. ],
[0. , 0.04907345, 1. , ..., 0. , 0. ,
0. ],
...,
[0. , 0. , 0. , ..., 1. , 0. ,
0.08375766],
[0. , 0. , 0. , ..., 0. , 1. ,
0. ],
[0. , 0. , 0. , ..., 0.08375766, 0. ,
1. ]])
indices = pd.Series(data.index, index=data['title']).drop_duplicates()
print(indices.head())
title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64
idx = indices['Father of the Bride Part II']
print(idx)
4
def get_recommendations(title, cosine_sim=cosine_sim):
    # look up the index of the selected movie from its title, so we can work with that movie
    idx = indices[title]
    # compute the similarity between this movie and every other movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    # sort the movies by similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # take the 10 most similar movies (index 0 is the movie itself)
    sim_scores = sim_scores[1:11]
    # get the indices of those 10 movies
    movie_indices = [i[0] for i in sim_scores]
    # return the titles of the 10 most similar movies
    return data['title'].iloc[movie_indices]
get_recommendations('The Dark Knight Rises')
12481                            The Dark Knight
150                               Batman Forever
1328                              Batman Returns
15511                 Batman: Under the Red Hood
585                                       Batman
9230          Batman Beyond: Return of the Joker
18035                           Batman: Year One
19792    Batman: The Dark Knight Returns, Part 1
3095                Batman: Mask of the Phantasm
10122                              Batman Begins
Name: title, dtype: object
import numpy as np
def dist(x, y):
    return np.sqrt(np.sum((x - y) ** 2))
doc1 = np.array((2,3,0,1))
doc2 = np.array((1,2,3,1))
doc3 = np.array((2,1,2,2))
docQ = np.array((1,1,0,1))
print(dist(doc1,docQ))
print(dist(doc2,docQ))
print(dist(doc3,docQ))
2.23606797749979
3.1622776601683795
2.449489742783178
# Suppose we have the following two documents.
# Only two words, apple and banana, appear in both documents.
doc1 = "apple banana everyone like likey watch card holder"
doc2 = "apple banana coupon passport love you"
# perform tokenization
tokenized_doc1 = doc1.split()
tokenized_doc2 = doc2.split()
# print the tokenization results
print(tokenized_doc1)
print(tokenized_doc2)
['apple', 'banana', 'everyone', 'like', 'likey', 'watch', 'card', 'holder']
['apple', 'banana', 'coupon', 'passport', 'love', 'you']
union = set(tokenized_doc1).union(set(tokenized_doc2))
print(union)
{'card', 'apple', 'love', 'banana', 'coupon', 'watch', 'passport', 'likey', 'like', 'holder', 'you', 'everyone'}
intersection = set(tokenized_doc1).intersection(set(tokenized_doc2))
print(intersection)
{'apple', 'banana'}
print(len(intersection)/len(union))  # 2 divided by 12
0.16666666666666666
Assume that n philosophers are eating spaghetti.
An interesting fact can be observed in CPython.
In the common assignment statement a = 1, the = operator means that the value 1 is bound to the name a.
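A minimal illustration of this binding in CPython (our own example, not from the notes): assignment attaches a name to an object rather than copying a value, which can be seen with the is operator.
a = 1
b = a          # b is bound to the same object that a refers to
print(a is b)  # True: both names refer to the same int object
print(id(a) == id(b))  # True for the same reason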
Network
Refers to the connection between devices for efficient data transmission.
Protocol